src/librustdoc/html/highlight.rs - third_party/rust - Git at Google

 // Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 //! Basic syntax highlighting functionality.
 //!
 //! This module uses libsyntax's lexer to provide token-based highlighting for
 //! the HTML documentation generated by rustdoc.
 //!
 //! If you just want syntax highlighting for a Rust program, then you can use
 //! the `render_inner_with_highlighting` or `render_with_highlighting`
 //! functions. For more advanced use cases (if you want to supply your own css
 //! classes or control how the HTML is generated, or even generate something
 //! other than HTML), then you should implement the `Writer` trait and use a
 //! `Classifier`.

 use html::escape::Escape;

 use std::fmt::Display;
 use std::io;
 use std::io::prelude::*;

 use syntax::codemap::CodeMap;
 use syntax::parse::lexer::{self, Reader, TokenAndSpan};
 use syntax::parse::token;
 use syntax::parse;
 use syntax_pos::Span;

 /// Highlights `src`, returning the HTML output.
 pub fn render_with_highlighting(src: &str, class: Option<&str>, id: Option<&str>) -> String {
     debug!("highlighting: ================\n{}\n==============", src);
     let sess = parse::ParseSess::new();
     let fm = sess.codemap().new_filemap("<stdin>".to_string(), None, src.to_string());

     let mut out = Vec::new();
     write_header(class, id, &mut out).unwrap();

     let mut classifier = Classifier::new(lexer::StringReader::new(&sess.span_diagnostic, fm),
                                          sess.codemap());
     if let Err(_) = classifier.write_source(&mut out) {
         return format!("<pre>{}</pre>", src);
     }

     write_footer(&mut out).unwrap();
     String::from_utf8_lossy(&out[..]).into_owned()
 }

 /// Highlights `src` and returns only the inner HTML, i.e. the markup meant to
 /// be placed inside an element supplied by the caller. Compare with
 /// `render_with_highlighting`, which wraps the result in a `<pre>` block.
 ///
 /// An `Err` is returned when writing the highlighted source fails.
 pub fn render_inner_with_highlighting(src: &str) -> io::Result<String> {
     let sess = parse::ParseSess::new();
     let codemap = sess.codemap();
     let filemap = codemap.new_filemap("<stdin>".to_string(), None, src.to_string());
     let reader = lexer::StringReader::new(&sess.span_diagnostic, filemap);

     let mut buf = Vec::new();
     let mut classifier = Classifier::new(reader, codemap);
     classifier.write_source(&mut buf)?;

     Ok(String::from_utf8_lossy(&buf).into_owned())
 }

 /// Processes a program (nested in the internal `lexer`), classifying strings of
 /// text by highlighting category (`Class`). Calls out to a `Writer` to write
 /// each span of text in sequence.
 pub struct Classifier<'a> {
     // Source of the tokens that get classified.
     lexer: lexer::StringReader<'a>,
     // Used to recover the original source text of a token from its span.
     codemap: &'a CodeMap,

     // State of the classifier.
     // Set when a `#` is seen, cleared again at the next `]`.
     in_attribute: bool,
     // Set when an identifier is followed by `!`; cleared once that `!` has
     // been classified.
     in_macro: bool,
     // Set when a `$` is followed by an identifier; cleared once an identifier
     // has been classified as a macro non-terminal.
     in_macro_nonterminal: bool,
 }

 /// How a span of text is classified. Mostly corresponds to token kinds.
 ///
 /// `rustdoc_class` maps each variant to the css class name used by rustdoc.
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub enum Class {
     // No highlighting; the default writer emits such text without a span.
     None,
     // Ordinary (non-doc) comments.
     Comment,
     // Doc comments.
     DocComment,
     // Span opened at a `#` and closed at the following `]`.
     Attribute,
     // Keywords, and special macro variables.
     KeyWord,
     // Keywords that do pointer/reference stuff.
     RefKeyWord,
     // `self` and `Self`.
     Self_,
     // Operator tokens.
     Op,
     // A macro name and the `!` that follows it.
     Macro,
     // A `$` that is followed by an identifier, and that identifier.
     MacroNonTerminal,
     // Byte, char and (byte/raw) string literals.
     String,
     // Integer and float literals.
     Number,
     // `true` and `false`.
     Bool,
     // Any other identifier.
     Ident,
     // Lifetimes.
     Lifetime,
     // `Option` and `Result`.
     PreludeTy,
     // `Some`, `None`, `Ok` and `Err`.
     PreludeVal,
 }

 /// Trait that controls writing the output of syntax highlighting. Users should
 /// implement this trait to customise writing output.
 ///
 /// The classifier will call into the `Writer` implementation as it finds spans
 /// of text to highlight. Exactly how that text should be highlighted is up to
 /// the implementation.
 pub trait Writer {
     /// Called when we start processing a span of text that should be highlighted.
     /// The `Class` argument specifies how it should be highlighted.
     fn enter_span(&mut self, Class) -> io::Result<()>;

     /// Called at the end of a span of highlighted text.
     fn exit_span(&mut self) -> io::Result<()>;

     /// Called for a span of text, usually, but not always, a single token. If
     /// the string of text (`T`) does correspond to a token, then the token will
     /// also be passed (as `Some`); otherwise the last argument is `None`. If
     /// the text should be highlighted differently from the surrounding text,
     /// then the `Class` argument will be a value other than `None`.
     ///
     /// The following sequences of callbacks are equivalent:
     /// ```plain
     ///     enter_span(Foo), string("text", None), exit_span()
     ///     string("text", Foo)
     /// ```
     /// The latter can be thought of as a shorthand for the former, which is
     /// more flexible.
     fn string<T: Display>(&mut self, T, Class, Option<&TokenAndSpan>) -> io::Result<()>;
 }

 // Blanket `Writer` implementation for anything that can be written to. It
 // produces rustdoc's default markup: `<span>` elements whose `class`
 // attribute comes from `Class::rustdoc_class`.
 impl<U: Write> Writer for U {
     fn enter_span(&mut self, klass: Class) -> io::Result<()> {
         let css = klass.rustdoc_class();
         write!(self, "<span class='{}'>", css)
     }

     fn exit_span(&mut self) -> io::Result<()> {
         write!(self, "</span>")
     }

     fn string<T: Display>(&mut self,
                           text: T,
                           klass: Class,
                           _tas: Option<&TokenAndSpan>)
                           -> io::Result<()> {
         // Unclassified text is emitted bare, without a wrapping span.
         if klass == Class::None {
             return write!(self, "{}", text);
         }
         write!(self, "<span class='{}'>{}</span>", klass.rustdoc_class(), text)
     }
 }

 impl<'a> Classifier<'a> {
     /// Creates a classifier that reads tokens from `lexer` and keeps `codemap`
     /// for looking up source snippets. All state flags start out cleared.
     pub fn new(lexer: lexer::StringReader<'a>, codemap: &'a CodeMap) -> Classifier<'a> {
         Classifier {
             in_attribute: false,
             in_macro: false,
             in_macro_nonterminal: false,
             codemap: codemap,
             lexer: lexer,
         }
     }

     /// Exhausts the `lexer`, writing the output into `out`.
     ///
     /// The general structure for this method is to iterate over each token,
     /// possibly giving it an HTML span with a class specifying what flavor of token
     /// is used. All source code emission is done as slices from the source map,
     /// not from the tokens themselves, in order to stay true to the original
     /// source.
     ///
     /// Returns an error if the lexer fails to produce a token (after emitting
     /// the lexer's diagnostics and a warning) or if writing to `out` fails.
     pub fn write_source<W: Writer>(&mut self,
                                    out: &mut W)
                                    -> io::Result<()> {
         loop {
             let next = match self.lexer.try_next_token() {
                 Ok(tas) => tas,
                 Err(_) => {
                     self.lexer.emit_fatal_errors();
                     self.lexer.span_diagnostic.struct_warn("Backing out of syntax highlighting")
                                               .note("You probably did not intend to render this \
                                                      as a rust code-block")
                                               .emit();
                     return Err(io::Error::new(io::ErrorKind::Other,
                                               "failed to tokenize the source for highlighting"));
                 }
             };

             if next.tok == token::Eof {
                 break;
             }

             self.write_token(out, next)?;
         }

         // A `#` that was never followed by a closing `]` leaves the attribute
         // span open. Close it here so that `enter_span` is not left without a
         // matching `exit_span` at the end of the input.
         if self.in_attribute {
             self.in_attribute = false;
             out.exit_span()?;
         }

         Ok(())
     }

     // Handles an individual token from the lexer.
     //
     // Most tokens are mapped to a `Class` and then written in one go at the
     // bottom of the function. A few tokens (shebang, `#`, and the `]` that
     // closes an attribute) write their own output and return early. The order
     // of the match arms matters: the guarded arms for `&` and `!` have to come
     // before the generic operator arm.
     fn write_token<W: Writer>(&mut self,
                               out: &mut W,
                               tas: TokenAndSpan)
                               -> io::Result<()> {
         let klass = match tas.tok {
             // The shebang text is taken from the token itself and written
             // without any highlighting.
             token::Shebang(s) => {
                 out.string(Escape(&s.as_str()), Class::None, Some(&tas))?;
                 return Ok(());
             },

             token::Whitespace => Class::None,
             token::Comment => Class::Comment,
             token::DocComment(..) => Class::DocComment,

             // If this '&' token is directly adjacent to another token, assume
             // that it's the address-of operator instead of the and-operator.
             token::BinOp(token::And) if self.lexer.peek().sp.lo == tas.sp.hi => Class::RefKeyWord,

             // Consider this as part of a macro invocation if there was a
             // leading identifier.
             token::Not if self.in_macro => {
                 self.in_macro = false;
                 Class::Macro
             }

             // Operators.
             token::Eq | token::Lt | token::Le | token::EqEq | token::Ne | token::Ge | token::Gt |
                 token::AndAnd | token::OrOr | token::Not | token::BinOp(..) | token::RArrow |
                 token::BinOpEq(..) | token::FatArrow => Class::Op,

             // Miscellaneous, no highlighting.
             token::Dot | token::DotDot | token::DotDotDot | token::Comma | token::Semi |
                 token::Colon | token::ModSep | token::LArrow | token::OpenDelim(_) |
                 token::CloseDelim(token::Brace) | token::CloseDelim(token::Paren) |
                 token::CloseDelim(token::NoDelim) |
                 token::Question => Class::None,
             // A `$` is only highlighted when an identifier follows it; the
             // flag makes that identifier get the same class.
             token::Dollar => {
                 if self.lexer.peek().tok.is_ident() {
                     self.in_macro_nonterminal = true;
                     Class::MacroNonTerminal
                 } else {
                     Class::None
                 }
             }

             // This is the start of an attribute. We're going to want to
             // continue highlighting it as an attribute until the ending ']' is
             // seen, so skip out early. Down below we terminate the attribute
             // span when we see the ']'.
             token::Pound => {
                 self.in_attribute = true;
                 out.enter_span(Class::Attribute)?;
                 out.string("#", Class::None, None)?;
                 return Ok(());
             }
             // A `]` ends the attribute span if one is open; otherwise it is
             // ordinary, unhighlighted punctuation.
             token::CloseDelim(token::Bracket) => {
                 if self.in_attribute {
                     self.in_attribute = false;
                     out.string("]", Class::None, None)?;
                     out.exit_span()?;
                     return Ok(());
                 } else {
                     Class::None
                 }
             }

             token::Literal(lit, _suf) => {
                 match lit {
                     // Text literals.
                     token::Byte(..) | token::Char(..) |
                         token::ByteStr(..) | token::ByteStrRaw(..) |
                         token::Str_(..) | token::StrRaw(..) => Class::String,

                     // Number literals.
                     token::Integer(..) | token::Float(..) => Class::Number,
                 }
             }

             // Keywords are also included in the identifier set.
             token::Ident(ident) => {
                 match &*ident.name.as_str() {
                     "ref" | "mut" => Class::RefKeyWord,

                     "self" |"Self" => Class::Self_,
                     "false" | "true" => Class::Bool,

                     "Option" | "Result" => Class::PreludeTy,
                     "Some" | "None" | "Ok" | "Err" => Class::PreludeVal,

                     _ if tas.tok.is_any_keyword() => Class::KeyWord,
                     // A plain identifier: it is a macro non-terminal if a `$`
                     // came just before it, a macro name if a `!` follows, and
                     // an ordinary identifier otherwise.
                     _ => {
                         if self.in_macro_nonterminal {
                             self.in_macro_nonterminal = false;
                             Class::MacroNonTerminal
                         } else if self.lexer.peek().tok == token::Not {
                             self.in_macro = true;
                             Class::Macro
                         } else {
                             Class::Ident
                         }
                     }
                 }
             }

             // Special macro vars are like keywords.
             token::SpecialVarNt(_) => Class::KeyWord,

             token::Lifetime(..) => Class::Lifetime,

             token::Underscore | token::Eof | token::Interpolated(..) |
             token::MatchNt(..) | token::SubstNt(..) | token::Tilde | token::At => Class::None,
         };

         // Anything that didn't return above is the simple case where the
         // class just spans a single token, so we can use the `string` method.
         // The text is the escaped source snippet for the token's span.
         out.string(Escape(&self.snip(tas.sp)), klass, Some(&tas))
     }

     // Looks up the source text covered by `sp` in the codemap. Panics if the
     // codemap cannot produce a snippet for that span.
     fn snip(&self, sp: Span) -> String {
         let snippet = self.codemap.span_to_snippet(sp);
         snippet.unwrap()
     }
 }

 impl Class {
     /// Maps a `Class` to the css class name rustdoc uses for it. `Class::None`
     /// maps to the empty string.
     pub fn rustdoc_class(self) -> &'static str {
         match self {
             Class::None => "",

             // Comments and attributes.
             Class::Comment => "comment",
             Class::DocComment => "doccomment",
             Class::Attribute => "attribute",

             // Keywords and keyword-like names.
             Class::KeyWord => "kw",
             Class::RefKeyWord => "kw-2",
             Class::Self_ => "self",
             Class::Bool => "bool-val",

             // Operators and macros.
             Class::Op => "op",
             Class::Macro => "macro",
             Class::MacroNonTerminal => "macro-nonterminal",

             // Literals.
             Class::String => "string",
             Class::Number => "number",

             // Names.
             Class::Ident => "ident",
             Class::Lifetime => "lifetime",
             Class::PreludeTy => "prelude-ty",
             Class::PreludeVal => "prelude-val",
         }
     }
 }

 /// Writes the opening `<pre>` tag to `out`: an `id` attribute when `id` is
 /// given, then a `class` attribute holding `rust` followed by `class` (or by
 /// nothing when `class` is `None`), then a newline.
 fn write_header(class: Option<&str>,
                 id: Option<&str>,
                 out: &mut Write)
                 -> io::Result<()> {
     let extra_class = class.unwrap_or("");

     write!(out, "<pre ")?;
     match id {
         Some(id) => write!(out, "id='{}' ", id)?,
         None => {}
     }
     writeln!(out, "class='rust {}'>", extra_class)
 }

 /// Writes the closing `</pre>` tag, followed by a newline, to `out`.
 fn write_footer(out: &mut Write) -> io::Result<()> {
     writeln!(out, "</pre>")
 }
	// Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
	// file at the top-level directory of this distribution and at
	// http://rust-lang.org/COPYRIGHT.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	//! Basic syntax highlighting functionality.
	//!
	//! This module uses libsyntax's lexer to provide token-based highlighting for
	//! the HTML documentation generated by rustdoc.
	//!
	//! If you just want syntax highlighting for a Rust program, then you can use
	//! the `render_inner_with_highlighting` or `render_with_highlighting`
	//! functions. For more advanced use cases (if you want to supply your own css
	//! classes or control how the HTML is generated, or even generate something
	//! other than HTML), then you should implement the `Writer` trait and use a
	//! `Classifier`.

	use html::escape::Escape;

	use std::fmt::Display;
	use std::io;
	use std::io::prelude::*;

	use syntax::codemap::CodeMap;
	use syntax::parse::lexer::{self, Reader, TokenAndSpan};
	use syntax::parse::token;
	use syntax::parse;
	use syntax_pos::Span;

	/// Highlights `src`, returning the HTML output.
	pub fn render_with_highlighting(src: &str, class: Option<&str>, id: Option<&str>) -> String {
	debug!("highlighting: ================\n{}\n==============", src);
	let sess = parse::ParseSess::new();
	let fm = sess.codemap().new_filemap("<stdin>".to_string(), None, src.to_string());

	let mut out = Vec::new();
	write_header(class, id, &mut out).unwrap();

	let mut classifier = Classifier::new(lexer::StringReader::new(&sess.span_diagnostic, fm),
	sess.codemap());
	if let Err(_) = classifier.write_source(&mut out) {
	return format!("<pre>{}</pre>", src);
	}

	write_footer(&mut out).unwrap();
	String::from_utf8_lossy(&out[..]).into_owned()
	}

	/// Highlights `src` and returns only the inner HTML, i.e. the markup meant to
	/// be placed inside an element supplied by the caller. Compare with
	/// `render_with_highlighting`, which wraps the result in a `<pre>` block.
	///
	/// An `Err` is returned when writing the highlighted source fails.
	pub fn render_inner_with_highlighting(src: &str) -> io::Result<String> {
	    let sess = parse::ParseSess::new();
	    let codemap = sess.codemap();
	    let filemap = codemap.new_filemap("<stdin>".to_string(), None, src.to_string());
	    let reader = lexer::StringReader::new(&sess.span_diagnostic, filemap);

	    let mut buf = Vec::new();
	    let mut classifier = Classifier::new(reader, codemap);
	    classifier.write_source(&mut buf)?;

	    Ok(String::from_utf8_lossy(&buf).into_owned())
	}

	/// Processes a program (nested in the internal `lexer`), classifying strings of
	/// text by highlighting category (`Class`). Calls out to a `Writer` to write
	/// each span of text in sequence.
	pub struct Classifier<'a> {
	// Source of the tokens that get classified.
	lexer: lexer::StringReader<'a>,
	// Used to recover the original source text of a token from its span.
	codemap: &'a CodeMap,

	// State of the classifier.
	// Set when a `#` is seen, cleared again at the next `]`.
	in_attribute: bool,
	// Set when an identifier is followed by `!`; cleared once that `!` has
	// been classified.
	in_macro: bool,
	// Set when a `$` is followed by an identifier; cleared once an identifier
	// has been classified as a macro non-terminal.
	in_macro_nonterminal: bool,
	}

	/// How a span of text is classified. Mostly corresponds to token kinds.
	///
	/// `rustdoc_class` maps each variant to the css class name used by rustdoc.
	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
	pub enum Class {
	// No highlighting; the default writer emits such text without a span.
	None,
	// Ordinary (non-doc) comments.
	Comment,
	// Doc comments.
	DocComment,
	// Span opened at a `#` and closed at the following `]`.
	Attribute,
	// Keywords, and special macro variables.
	KeyWord,
	// Keywords that do pointer/reference stuff.
	RefKeyWord,
	// `self` and `Self`.
	Self_,
	// Operator tokens.
	Op,
	// A macro name and the `!` that follows it.
	Macro,
	// A `$` that is followed by an identifier, and that identifier.
	MacroNonTerminal,
	// Byte, char and (byte/raw) string literals.
	String,
	// Integer and float literals.
	Number,
	// `true` and `false`.
	Bool,
	// Any other identifier.
	Ident,
	// Lifetimes.
	Lifetime,
	// `Option` and `Result`.
	PreludeTy,
	// `Some`, `None`, `Ok` and `Err`.
	PreludeVal,
	}

	/// Trait that controls writing the output of syntax highlighting. Users should
	/// implement this trait to customise writing output.
	///
	/// The classifier will call into the `Writer` implementation as it finds spans
	/// of text to highlight. Exactly how that text should be highlighted is up to
	/// the implementation.
	pub trait Writer {
	/// Called when we start processing a span of text that should be highlighted.
	/// The `Class` argument specifies how it should be highlighted.
	fn enter_span(&mut self, Class) -> io::Result<()>;

	/// Called at the end of a span of highlighted text.
	fn exit_span(&mut self) -> io::Result<()>;

	/// Called for a span of text, usually, but not always, a single token. If
	/// the string of text (`T`) does correspond to a token, then the token will
	/// also be passed (as `Some`); otherwise the last argument is `None`. If
	/// the text should be highlighted differently from the surrounding text,
	/// then the `Class` argument will be a value other than `None`.
	///
	/// The following sequences of callbacks are equivalent:
	/// ```plain
	///     enter_span(Foo), string("text", None), exit_span()
	///     string("text", Foo)
	/// ```
	/// The latter can be thought of as a shorthand for the former, which is
	/// more flexible.
	fn string<T: Display>(&mut self, T, Class, Option<&TokenAndSpan>) -> io::Result<()>;
	}

	// Blanket `Writer` implementation for anything that can be written to. It
	// produces rustdoc's default markup: `<span>` elements whose `class`
	// attribute comes from `Class::rustdoc_class`.
	impl<U: Write> Writer for U {
	    fn enter_span(&mut self, klass: Class) -> io::Result<()> {
	        let css = klass.rustdoc_class();
	        write!(self, "<span class='{}'>", css)
	    }

	    fn exit_span(&mut self) -> io::Result<()> {
	        write!(self, "</span>")
	    }

	    fn string<T: Display>(&mut self,
	                          text: T,
	                          klass: Class,
	                          _tas: Option<&TokenAndSpan>)
	                          -> io::Result<()> {
	        // Unclassified text is emitted bare, without a wrapping span.
	        if klass == Class::None {
	            return write!(self, "{}", text);
	        }
	        write!(self, "<span class='{}'>{}</span>", klass.rustdoc_class(), text)
	    }
	}

	impl<'a> Classifier<'a> {
	/// Creates a classifier that reads tokens from `lexer` and keeps `codemap`
	/// for looking up source snippets. All state flags start out cleared.
	pub fn new(lexer: lexer::StringReader<'a>, codemap: &'a CodeMap) -> Classifier<'a> {
	    Classifier {
	        in_attribute: false,
	        in_macro: false,
	        in_macro_nonterminal: false,
	        codemap: codemap,
	        lexer: lexer,
	    }
	}

	/// Exhausts the `lexer`, writing the output into `out`.
	///
	/// The general structure for this method is to iterate over each token,
	/// possibly giving it an HTML span with a class specifying what flavor of token
	/// is used. All source code emission is done as slices from the source map,
	/// not from the tokens themselves, in order to stay true to the original
	/// source.
	///
	/// Returns an error if the lexer fails to produce a token (after emitting
	/// the lexer's diagnostics and a warning) or if writing to `out` fails.
	pub fn write_source<W: Writer>(&mut self,
	                               out: &mut W)
	                               -> io::Result<()> {
	    loop {
	        let next = match self.lexer.try_next_token() {
	            Ok(tas) => tas,
	            Err(_) => {
	                self.lexer.emit_fatal_errors();
	                self.lexer.span_diagnostic.struct_warn("Backing out of syntax highlighting")
	                                          .note("You probably did not intend to render this \
	                                                 as a rust code-block")
	                                          .emit();
	                return Err(io::Error::new(io::ErrorKind::Other,
	                                          "failed to tokenize the source for highlighting"));
	            }
	        };

	        if next.tok == token::Eof {
	            break;
	        }

	        self.write_token(out, next)?;
	    }

	    // A `#` that was never followed by a closing `]` leaves the attribute
	    // span open. Close it here so that `enter_span` is not left without a
	    // matching `exit_span` at the end of the input.
	    if self.in_attribute {
	        self.in_attribute = false;
	        out.exit_span()?;
	    }

	    Ok(())
	}

	// Handles an individual token from the lexer.
	fn write_token<W: Writer>(&mut self,
	out: &mut W,
	tas: TokenAndSpan)
	-> io::Result<()> {
	let klass = match tas.tok {
	token::Shebang(s) => {
	out.string(Escape(&s.as_str()), Class::None, Some(&tas))?;
	return Ok(());
	},

	token::Whitespace => Class::None,
	token::Comment => Class::Comment,
	token::DocComment(..) => Class::DocComment,

	// If this '&' token is directly adjacent to another token, assume
	// that it's the address-of operator instead of the and-operator.
	token::BinOp(token::And) if self.lexer.peek().sp.lo == tas.sp.hi => Class::RefKeyWord,

	// Consider this as part of a macro invocation if there was a
	// leading identifier.
	token::Not if self.in_macro => {
	self.in_macro = false;
	Class::Macro
	}

	// Operators.
	token::Eq \| token::Lt \| token::Le \| token::EqEq \| token::Ne \| token::Ge \| token::Gt \|
	token::AndAnd \| token::OrOr \| token::Not \| token::BinOp(..) \| token::RArrow \|
	token::BinOpEq(..) \| token::FatArrow => Class::Op,

	// Miscellaneous, no highlighting.
	token::Dot \| token::DotDot \| token::DotDotDot \| token::Comma \| token::Semi \|
	token::Colon \| token::ModSep \| token::LArrow \| token::OpenDelim(_) \|
	token::CloseDelim(token::Brace) \| token::CloseDelim(token::Paren) \|
	token::CloseDelim(token::NoDelim) \|
	token::Question => Class::None,
	token::Dollar => {
	if self.lexer.peek().tok.is_ident() {
	self.in_macro_nonterminal = true;
	Class::MacroNonTerminal
	} else {
	Class::None
	}
	}

	// This is the start of an attribute. We're going to want to
	// continue highlighting it as an attribute until the ending ']' is
	// seen, so skip out early. Down below we terminate the attribute
	// span when we see the ']'.
	token::Pound => {
	self.in_attribute = true;
	out.enter_span(Class::Attribute)?;
	out.string("#", Class::None, None)?;
	return Ok(());
	}
	token::CloseDelim(token::Bracket) => {
	if self.in_attribute {
	self.in_attribute = false;
	out.string("]", Class::None, None)?;
	out.exit_span()?;
	return Ok(());
	} else {
	Class::None
	}
	}

	token::Literal(lit, _suf) => {
	match lit {
	// Text literals.
	token::Byte(..) \| token::Char(..) \|
	token::ByteStr(..) \| token::ByteStrRaw(..) \|
	token::Str_(..) \| token::StrRaw(..) => Class::String,

	// Number literals.
	token::Integer(..) \| token::Float(..) => Class::Number,
	}
	}

	// Keywords are also included in the identifier set.
	token::Ident(ident) => {
	match &*ident.name.as_str() {
	"ref" \| "mut" => Class::RefKeyWord,

	"self" \|"Self" => Class::Self_,
	"false" \| "true" => Class::Bool,

	"Option" \| "Result" => Class::PreludeTy,
	"Some" \| "None" \| "Ok" \| "Err" => Class::PreludeVal,

	_ if tas.tok.is_any_keyword() => Class::KeyWord,
	_ => {
	if self.in_macro_nonterminal {
	self.in_macro_nonterminal = false;
	Class::MacroNonTerminal
	} else if self.lexer.peek().tok == token::Not {
	self.in_macro = true;
	Class::Macro
	} else {
	Class::Ident
	}
	}
	}
	}

	// Special macro vars are like keywords.
	token::SpecialVarNt(_) => Class::KeyWord,

	token::Lifetime(..) => Class::Lifetime,

	token::Underscore \| token::Eof \| token::Interpolated(..) \|
	token::MatchNt(..) \| token::SubstNt(..) \| token::Tilde \| token::At => Class::None,
	};

	// Anything that didn't return above is the simple case where we the
	// class just spans a single token, so we can use the `string` method.
	out.string(Escape(&self.snip(tas.sp)), klass, Some(&tas))
	}

	// Looks up the source text covered by `sp` in the codemap. Panics if the
	// codemap cannot produce a snippet for that span.
	fn snip(&self, sp: Span) -> String {
	    let snippet = self.codemap.span_to_snippet(sp);
	    snippet.unwrap()
	}
	}

	impl Class {
	    /// Maps a `Class` to the css class name rustdoc uses for it. `Class::None`
	    /// maps to the empty string.
	    pub fn rustdoc_class(self) -> &'static str {
	        match self {
	            Class::None => "",

	            // Comments and attributes.
	            Class::Comment => "comment",
	            Class::DocComment => "doccomment",
	            Class::Attribute => "attribute",

	            // Keywords and keyword-like names.
	            Class::KeyWord => "kw",
	            Class::RefKeyWord => "kw-2",
	            Class::Self_ => "self",
	            Class::Bool => "bool-val",

	            // Operators and macros.
	            Class::Op => "op",
	            Class::Macro => "macro",
	            Class::MacroNonTerminal => "macro-nonterminal",

	            // Literals.
	            Class::String => "string",
	            Class::Number => "number",

	            // Names.
	            Class::Ident => "ident",
	            Class::Lifetime => "lifetime",
	            Class::PreludeTy => "prelude-ty",
	            Class::PreludeVal => "prelude-val",
	        }
	    }
	}

	/// Writes the opening `<pre>` tag to `out`: an `id` attribute when `id` is
	/// given, then a `class` attribute holding `rust` followed by `class` (or by
	/// nothing when `class` is `None`), then a newline.
	fn write_header(class: Option<&str>,
	                id: Option<&str>,
	                out: &mut Write)
	                -> io::Result<()> {
	    let extra_class = class.unwrap_or("");

	    write!(out, "<pre ")?;
	    match id {
	        Some(id) => write!(out, "id='{}' ", id)?,
	        None => {}
	    }
	    writeln!(out, "class='rust {}'>", extra_class)
	}

	/// Writes the closing `</pre>` tag, followed by a newline, to `out`.
	fn write_footer(out: &mut Write) -> io::Result<()> {
	    writeln!(out, "</pre>")
	}