// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
| |
| #![feature(plugin, rustc_private)] |
| |
| extern crate syntax; |
| extern crate rustc; |
| |
| #[macro_use] |
| extern crate log; |
| |
| use std::collections::HashMap; |
| use std::env; |
| use std::fs::File; |
| use std::io::{BufRead, Read}; |
| use std::path::Path; |
| |
| use syntax::parse; |
| use syntax::parse::lexer; |
| use rustc::dep_graph::DepGraph; |
| use rustc::session::{self, config}; |
| use rustc::middle::cstore::DummyCrateStore; |
| |
| use std::rc::Rc; |
| use syntax::ast; |
| use syntax::ast::Name; |
| use syntax::codemap; |
| use syntax::parse::token::{self, BinOpToken, DelimToken, Lit, Token}; |
| use syntax::parse::lexer::TokenAndSpan; |
| use syntax_pos::Pos; |
| |
| fn parse_token_list(file: &str) -> HashMap<String, token::Token> { |
| fn id() -> token::Token { |
| Token::Ident(ast::Ident::with_empty_ctxt(Name(0))) |
| } |
| |
| let mut res = HashMap::new(); |
| |
| res.insert("-1".to_string(), Token::Eof); |
| |
| for line in file.split('\n') { |
| let eq = match line.trim().rfind('=') { |
| Some(val) => val, |
| None => continue |
| }; |
| |
| let val = &line[..eq]; |
| let num = &line[eq + 1..]; |
| |
| let tok = match val { |
| "SHR" => Token::BinOp(BinOpToken::Shr), |
| "DOLLAR" => Token::Dollar, |
| "LT" => Token::Lt, |
| "STAR" => Token::BinOp(BinOpToken::Star), |
| "FLOAT_SUFFIX" => id(), |
| "INT_SUFFIX" => id(), |
| "SHL" => Token::BinOp(BinOpToken::Shl), |
| "LBRACE" => Token::OpenDelim(DelimToken::Brace), |
| "RARROW" => Token::RArrow, |
| "LIT_STR" => Token::Literal(Lit::Str_(Name(0)), None), |
| "DOTDOT" => Token::DotDot, |
| "MOD_SEP" => Token::ModSep, |
| "DOTDOTDOT" => Token::DotDotDot, |
| "NOT" => Token::Not, |
| "AND" => Token::BinOp(BinOpToken::And), |
| "LPAREN" => Token::OpenDelim(DelimToken::Paren), |
| "ANDAND" => Token::AndAnd, |
| "AT" => Token::At, |
| "LBRACKET" => Token::OpenDelim(DelimToken::Bracket), |
| "LIT_STR_RAW" => Token::Literal(Lit::StrRaw(Name(0), 0), None), |
| "RPAREN" => Token::CloseDelim(DelimToken::Paren), |
| "SLASH" => Token::BinOp(BinOpToken::Slash), |
| "COMMA" => Token::Comma, |
| "LIFETIME" => Token::Lifetime(ast::Ident::with_empty_ctxt(Name(0))), |
| "CARET" => Token::BinOp(BinOpToken::Caret), |
| "TILDE" => Token::Tilde, |
| "IDENT" => id(), |
| "PLUS" => Token::BinOp(BinOpToken::Plus), |
| "LIT_CHAR" => Token::Literal(Lit::Char(Name(0)), None), |
| "LIT_BYTE" => Token::Literal(Lit::Byte(Name(0)), None), |
| "EQ" => Token::Eq, |
| "RBRACKET" => Token::CloseDelim(DelimToken::Bracket), |
| "COMMENT" => Token::Comment, |
| "DOC_COMMENT" => Token::DocComment(Name(0)), |
| "DOT" => Token::Dot, |
| "EQEQ" => Token::EqEq, |
| "NE" => Token::Ne, |
| "GE" => Token::Ge, |
| "PERCENT" => Token::BinOp(BinOpToken::Percent), |
| "RBRACE" => Token::CloseDelim(DelimToken::Brace), |
| "BINOP" => Token::BinOp(BinOpToken::Plus), |
| "POUND" => Token::Pound, |
| "OROR" => Token::OrOr, |
| "LIT_INTEGER" => Token::Literal(Lit::Integer(Name(0)), None), |
| "BINOPEQ" => Token::BinOpEq(BinOpToken::Plus), |
| "LIT_FLOAT" => Token::Literal(Lit::Float(Name(0)), None), |
| "WHITESPACE" => Token::Whitespace, |
| "UNDERSCORE" => Token::Underscore, |
| "MINUS" => Token::BinOp(BinOpToken::Minus), |
| "SEMI" => Token::Semi, |
| "COLON" => Token::Colon, |
| "FAT_ARROW" => Token::FatArrow, |
| "OR" => Token::BinOp(BinOpToken::Or), |
| "GT" => Token::Gt, |
| "LE" => Token::Le, |
| "LIT_BINARY" => Token::Literal(Lit::ByteStr(Name(0)), None), |
| "LIT_BINARY_RAW" => Token::Literal(Lit::ByteStrRaw(Name(0), 0), None), |
| "QUESTION" => Token::Question, |
| "SHEBANG" => Token::Shebang(Name(0)), |
| _ => continue, |
| }; |
| |
| res.insert(num.to_string(), tok); |
| } |
| |
| debug!("Token map: {:?}", res); |
| res |
| } |
| |
| fn str_to_binop(s: &str) -> token::BinOpToken { |
| match s { |
| "+" => BinOpToken::Plus, |
| "/" => BinOpToken::Slash, |
| "-" => BinOpToken::Minus, |
| "*" => BinOpToken::Star, |
| "%" => BinOpToken::Percent, |
| "^" => BinOpToken::Caret, |
| "&" => BinOpToken::And, |
| "|" => BinOpToken::Or, |
| "<<" => BinOpToken::Shl, |
| ">>" => BinOpToken::Shr, |
| _ => panic!("Bad binop str `{}`", s), |
| } |
| } |
| |
| /// Assuming a string/byte string literal, strip out the leading/trailing |
| /// hashes and surrounding quotes/raw/byte prefix. |
| fn fix(mut lit: &str) -> ast::Name { |
| let prefix: Vec<char> = lit.chars().take(2).collect(); |
| if prefix[0] == 'r' { |
| if prefix[1] == 'b' { |
| lit = &lit[2..] |
| } else { |
| lit = &lit[1..]; |
| } |
| } else if prefix[0] == 'b' { |
| lit = &lit[1..]; |
| } |
| |
| let leading_hashes = count(lit); |
| |
| // +1/-1 to adjust for single quotes |
| parse::token::intern(&lit[leading_hashes + 1..lit.len() - leading_hashes - 1]) |
| } |
| |
| /// Assuming a char/byte literal, strip the 'b' prefix and the single quotes. |
| fn fixchar(mut lit: &str) -> ast::Name { |
| let prefix = lit.chars().next().unwrap(); |
| if prefix == 'b' { |
| lit = &lit[1..]; |
| } |
| |
| parse::token::intern(&lit[1..lit.len() - 1]) |
| } |
| |
/// Returns how many `#` characters `lit` starts with.
fn count(lit: &str) -> usize {
    // `#` is a single byte, so the byte offset of the first character that
    // is not a `#` equals the number of leading hashes. A string made up of
    // hashes only (or an empty one) has no such character.
    lit.find(|c: char| c != '#').unwrap_or(lit.len())
}
| |
| fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize], |
| has_bom: bool) |
| -> TokenAndSpan { |
| // old regex: |
| // \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+] |
| let start = s.find("[@").unwrap(); |
| let comma = start + s[start..].find(",").unwrap(); |
| let colon = comma + s[comma..].find(":").unwrap(); |
| let content_start = colon + s[colon..].find("='").unwrap(); |
| // Use rfind instead of find, because we don't want to stop at the content |
| let content_end = content_start + s[content_start..].rfind("',<").unwrap(); |
| let toknum_end = content_end + s[content_end..].find(">,").unwrap(); |
| |
| let start = &s[comma + 1 .. colon]; |
| let end = &s[colon + 1 .. content_start]; |
| let content = &s[content_start + 2 .. content_end]; |
| let toknum = &s[content_end + 3 .. toknum_end]; |
| |
| let not_found = format!("didn't find token {:?} in the map", toknum); |
| let proto_tok = tokens.get(toknum).expect(¬_found[..]); |
| |
| let nm = parse::token::intern(content); |
| |
| debug!("What we got: content (`{}`), proto: {:?}", content, proto_tok); |
| |
| let real_tok = match *proto_tok { |
| Token::BinOp(..) => Token::BinOp(str_to_binop(content)), |
| Token::BinOpEq(..) => Token::BinOpEq(str_to_binop(&content[..content.len() - 1])), |
| Token::Literal(Lit::Str_(..), n) => Token::Literal(Lit::Str_(fix(content)), n), |
| Token::Literal(Lit::StrRaw(..), n) => Token::Literal(Lit::StrRaw(fix(content), |
| count(content)), n), |
| Token::Literal(Lit::Char(..), n) => Token::Literal(Lit::Char(fixchar(content)), n), |
| Token::Literal(Lit::Byte(..), n) => Token::Literal(Lit::Byte(fixchar(content)), n), |
| Token::DocComment(..) => Token::DocComment(nm), |
| Token::Literal(Lit::Integer(..), n) => Token::Literal(Lit::Integer(nm), n), |
| Token::Literal(Lit::Float(..), n) => Token::Literal(Lit::Float(nm), n), |
| Token::Literal(Lit::ByteStr(..), n) => Token::Literal(Lit::ByteStr(nm), n), |
| Token::Literal(Lit::ByteStrRaw(..), n) => Token::Literal(Lit::ByteStrRaw(fix(content), |
| count(content)), n), |
| Token::Ident(..) => Token::Ident(ast::Ident::with_empty_ctxt(nm)), |
| Token::Lifetime(..) => Token::Lifetime(ast::Ident::with_empty_ctxt(nm)), |
| ref t => t.clone() |
| }; |
| |
| let start_offset = if real_tok == Token::Eof { |
| 1 |
| } else { |
| 0 |
| }; |
| |
| let offset = if has_bom { 1 } else { 0 }; |
| |
| let mut lo = start.parse::<u32>().unwrap() - start_offset - offset; |
| let mut hi = end.parse::<u32>().unwrap() + 1 - offset; |
| |
| // Adjust the span: For each surrogate pair already encountered, subtract one position. |
| lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32; |
| hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32; |
| |
| let sp = syntax_pos::Span { |
| lo: syntax_pos::BytePos(lo), |
| hi: syntax_pos::BytePos(hi), |
| expn_id: syntax_pos::NO_EXPANSION |
| }; |
| |
| TokenAndSpan { |
| tok: real_tok, |
| sp: sp |
| } |
| } |
| |
| fn tok_cmp(a: &token::Token, b: &token::Token) -> bool { |
| match a { |
| &Token::Ident(id) => match b { |
| &Token::Ident(id2) => id == id2, |
| _ => false |
| }, |
| _ => a == b |
| } |
| } |
| |
| fn span_cmp(antlr_sp: codemap::Span, rust_sp: codemap::Span, cm: &codemap::CodeMap) -> bool { |
| antlr_sp.expn_id == rust_sp.expn_id && |
| antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() && |
| antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize() |
| } |
| |
| fn main() { |
| fn next(r: &mut lexer::StringReader) -> TokenAndSpan { |
| use syntax::parse::lexer::Reader; |
| r.next_token() |
| } |
| |
| let mut args = env::args().skip(1); |
| let filename = args.next().unwrap(); |
| if filename.find("parse-fail").is_some() { |
| return; |
| } |
| |
| // Rust's lexer |
| let mut code = String::new(); |
| File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap(); |
| |
| let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate() |
| .filter(|&(_, c)| c as usize > 0xFFFF) |
| .map(|(n, _)| n) |
| .enumerate() |
| .map(|(x, n)| x + n) |
| .collect(); |
| |
| let has_bom = code.starts_with("\u{feff}"); |
| |
| debug!("Pairs: {:?}", surrogate_pairs_pos); |
| |
| let options = config::basic_options(); |
| let session = session::build_session(options, &DepGraph::new(false), None, |
| syntax::diagnostics::registry::Registry::new(&[]), |
| Rc::new(DummyCrateStore)); |
| let filemap = session.parse_sess.codemap().new_filemap(String::from("<n/a>"), code); |
| let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap); |
| let cm = session.codemap(); |
| |
| // ANTLR |
| let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap(); |
| let mut token_list = String::new(); |
| token_file.read_to_string(&mut token_list).unwrap(); |
| let token_map = parse_token_list(&token_list[..]); |
| |
| let stdin = std::io::stdin(); |
| let lock = stdin.lock(); |
| let lines = lock.lines(); |
| let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(), |
| &token_map, |
| &surrogate_pairs_pos[..], |
| has_bom)); |
| |
| for antlr_tok in antlr_tokens { |
| let rustc_tok = next(&mut lexer); |
| if rustc_tok.tok == Token::Eof && antlr_tok.tok == Token::Eof { |
| continue |
| } |
| |
| assert!(span_cmp(antlr_tok.sp, rustc_tok.sp, cm), "{:?} and {:?} have different spans", |
| rustc_tok, |
| antlr_tok); |
| |
| macro_rules! matches { |
| ( $($x:pat),+ ) => ( |
| match rustc_tok.tok { |
| $($x => match antlr_tok.tok { |
| $x => { |
| if !tok_cmp(&rustc_tok.tok, &antlr_tok.tok) { |
| // FIXME #15677: needs more robust escaping in |
| // antlr |
| warn!("Different names for {:?} and {:?}", rustc_tok, antlr_tok); |
| } |
| } |
| _ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok) |
| },)* |
| ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok) |
| } |
| ) |
| } |
| |
| matches!( |
| Token::Literal(Lit::Byte(..), _), |
| Token::Literal(Lit::Char(..), _), |
| Token::Literal(Lit::Integer(..), _), |
| Token::Literal(Lit::Float(..), _), |
| Token::Literal(Lit::Str_(..), _), |
| Token::Literal(Lit::StrRaw(..), _), |
| Token::Literal(Lit::ByteStr(..), _), |
| Token::Literal(Lit::ByteStrRaw(..), _), |
| Token::Ident(..), |
| Token::Lifetime(..), |
| Token::Interpolated(..), |
| Token::DocComment(..), |
| Token::Shebang(..) |
| ); |
| } |
| } |