| // Copyright 2015 Google Inc. All rights reserved. |
| // |
| // Permission is hereby granted, free of charge, to any person obtaining a copy |
| // of this software and associated documentation files (the "Software"), to deal |
| // in the Software without restriction, including without limitation the rights |
| // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| // copies of the Software, and to permit persons to whom the Software is |
| // furnished to do so, subject to the following conditions: |
| // |
| // The above copyright notice and this permission notice shall be included in |
| // all copies or substantial portions of the Software. |
| // |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| // THE SOFTWARE. |
| |
| //! Scanners for fragments of CommonMark syntax |
| |
| use std::char; |
| use std::convert::TryInto; |
| |
| use crate::entities; |
| use crate::parse::{Alignment, HtmlScanGuard, LinkType}; |
| pub use crate::puncttable::{is_ascii_punctuation, is_punctuation}; |
| use crate::strings::CowStr; |
| |
| use memchr::memchr; |
| |
/// Tag names recognised by `is_html_tag`.
///
/// All entries are lower case and the table must stay in ascending byte
/// order, because the lookup is a binary search.
#[rustfmt::skip]
const HTML_TAGS: [&str; 62] = [
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
| |
| /// Analysis of the beginning of a line, including indentation and container |
| /// markers. |
| #[derive(Clone)] |
| pub struct LineStart<'a> { |
| bytes: &'a [u8], |
| tab_start: usize, |
| ix: usize, |
| spaces_remaining: usize, |
| // no thematic breaks can occur before this offset. |
| // this prevents scanning over and over up to a certain point |
| min_hrule_offset: usize, |
| } |
| |
| impl<'a> LineStart<'a> { |
| pub(crate) fn new(bytes: &[u8]) -> LineStart { |
| LineStart { |
| bytes, |
| tab_start: 0, |
| ix: 0, |
| spaces_remaining: 0, |
| min_hrule_offset: 0, |
| } |
| } |
| |
| /// Try to scan a number of spaces. |
| /// |
| /// Returns true if all spaces were consumed. |
| /// |
| /// Note: consumes some spaces even if not successful. |
| pub(crate) fn scan_space(&mut self, n_space: usize) -> bool { |
| self.scan_space_inner(n_space) == 0 |
| } |
| |
| /// Scan a number of spaces up to a maximum. |
| /// |
| /// Returns number of spaces scanned. |
| pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize { |
| n_space - self.scan_space_inner(n_space) |
| } |
| |
| /// Returns unused remainder of spaces. |
| fn scan_space_inner(&mut self, mut n_space: usize) -> usize { |
| let n_from_remaining = self.spaces_remaining.min(n_space); |
| self.spaces_remaining -= n_from_remaining; |
| n_space -= n_from_remaining; |
| while n_space > 0 && self.ix < self.bytes.len() { |
| match self.bytes[self.ix] { |
| b' ' => { |
| self.ix += 1; |
| n_space -= 1; |
| } |
| b'\t' => { |
| let spaces = 4 - (self.ix - self.tab_start) % 4; |
| self.ix += 1; |
| self.tab_start = self.ix; |
| let n = spaces.min(n_space); |
| n_space -= n; |
| self.spaces_remaining = spaces - n; |
| } |
| _ => break, |
| } |
| } |
| n_space |
| } |
| |
| /// Scan all available ASCII whitespace (not including eol). |
| pub(crate) fn scan_all_space(&mut self) { |
| self.spaces_remaining = 0; |
| self.ix += self.bytes[self.ix..] |
| .iter() |
| .take_while(|&&b| b == b' ' || b == b'\t') |
| .count(); |
| } |
| |
| /// Determine whether we're at end of line (includes end of file). |
| pub(crate) fn is_at_eol(&self) -> bool { |
| self.bytes |
| .get(self.ix) |
| .map(|&c| c == b'\r' || c == b'\n') |
| .unwrap_or(true) |
| } |
| |
| fn scan_ch(&mut self, c: u8) -> bool { |
| if self.ix < self.bytes.len() && self.bytes[self.ix] == c { |
| self.ix += 1; |
| true |
| } else { |
| false |
| } |
| } |
| |
| pub(crate) fn scan_blockquote_marker(&mut self) -> bool { |
| let save = self.clone(); |
| let _ = self.scan_space(3); |
| if self.scan_ch(b'>') { |
| let _ = self.scan_space(1); |
| true |
| } else { |
| *self = save; |
| false |
| } |
| } |
| |
| /// Scan a list marker. |
| /// |
| /// Return value is the character, the start index, and the indent in spaces. |
| /// For ordered list markers, the character will be one of b'.' or b')'. For |
| /// bullet list markers, it will be one of b'-', b'+', or b'*'. |
| pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, u64, usize)> { |
| let save = self.clone(); |
| let indent = self.scan_space_upto(3); |
| if self.ix < self.bytes.len() { |
| let c = self.bytes[self.ix]; |
| if c == b'-' || c == b'+' || c == b'*' { |
| if self.ix >= self.min_hrule_offset { |
| // there could be an hrule here |
| if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) { |
| self.min_hrule_offset = min_offset; |
| } else { |
| *self = save; |
| return None; |
| } |
| } |
| self.ix += 1; |
| if self.scan_space(1) || self.is_at_eol() { |
| return self.finish_list_marker(c, 0, indent + 2); |
| } |
| } else if c >= b'0' && c <= b'9' { |
| let start_ix = self.ix; |
| let mut ix = self.ix + 1; |
| let mut val = u64::from(c - b'0'); |
| while ix < self.bytes.len() && ix - start_ix < 10 { |
| let c = self.bytes[ix]; |
| ix += 1; |
| if c >= b'0' && c <= b'9' { |
| val = val * 10 + u64::from(c - b'0'); |
| } else if c == b')' || c == b'.' { |
| self.ix = ix; |
| if self.scan_space(1) || self.is_at_eol() { |
| return self.finish_list_marker(c, val, indent + self.ix - start_ix); |
| } else { |
| break; |
| } |
| } else { |
| break; |
| } |
| } |
| } |
| } |
| *self = save; |
| None |
| } |
| |
| fn finish_list_marker( |
| &mut self, |
| c: u8, |
| start: u64, |
| mut indent: usize, |
| ) -> Option<(u8, u64, usize)> { |
| let save = self.clone(); |
| |
| // skip the rest of the line if it's blank |
| if scan_blank_line(&self.bytes[self.ix..]).is_some() { |
| return Some((c, start, indent)); |
| } |
| |
| let post_indent = self.scan_space_upto(4); |
| if post_indent < 4 { |
| indent += post_indent; |
| } else { |
| *self = save; |
| } |
| Some((c, start, indent)) |
| } |
| |
| /// Returns Some(is_checked) when a task list marker was found. Resets itself |
| /// to original state otherwise. |
| pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> { |
| let save = self.clone(); |
| self.scan_space_upto(3); |
| |
| if !self.scan_ch(b'[') { |
| *self = save; |
| return None; |
| } |
| let is_checked = match self.bytes.get(self.ix) { |
| Some(&c) if is_ascii_whitespace_no_nl(c) => { |
| self.ix += 1; |
| false |
| } |
| Some(b'x') | Some(b'X') => { |
| self.ix += 1; |
| true |
| } |
| _ => { |
| *self = save; |
| return None; |
| } |
| }; |
| if !self.scan_ch(b']') { |
| *self = save; |
| return None; |
| } |
| if !self |
| .bytes |
| .get(self.ix) |
| .map(|&b| is_ascii_whitespace_no_nl(b)) |
| .unwrap_or(false) |
| { |
| *self = save; |
| return None; |
| } |
| Some(is_checked) |
| } |
| |
| pub(crate) fn bytes_scanned(&self) -> usize { |
| self.ix |
| } |
| |
| pub(crate) fn remaining_space(&self) -> usize { |
| self.spaces_remaining |
| } |
| } |
| |
/// True for the bytes 0x09 through 0x0d (tab, line feed, vertical tab, form
/// feed, carriage return) and for the space character.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    match c {
        0x09..=0x0d | b' ' => true,
        _ => false,
    }
}
| |
/// True for tab, vertical tab, form feed and space; false for `\n`, `\r`
/// and every other byte.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    match c {
        b'\t' | 0x0b | 0x0c | b' ' => true,
        _ => false,
    }
}
| |
/// True for the ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
| |
/// True for the ASCII digits `0`-`9` and the ASCII letters.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
| |
/// True for ASCII letters, ASCII digits and the hyphen `-`.
fn is_ascii_letterdigitdash(c: u8) -> bool {
    match c {
        b'-' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
| |
/// True for the ASCII digits `0`-`9`.
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
| |
/// Reports whether `c` may appear inside an unquoted HTML attribute value.
///
/// Every byte is allowed except quotes, space, `=`, `>`, `<`, backtick,
/// `\n` and `\r`.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    const FORBIDDEN: &[u8] = b"'\" =><`\n\r";
    !FORBIDDEN.contains(&c)
}
| |
/// Scans a single byte: returns 1 when `data` starts with `c`, otherwise 0
/// (including when `data` is empty).
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    match data.first() {
        Some(&first) if first == c => 1,
        _ => 0,
    }
}
| |
/// Returns the length of the longest prefix of `data` whose bytes all
/// satisfy `f`.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    match data.iter().position(|&byte| !f(byte)) {
        Some(first_rejected) => first_rejected,
        None => data.len(),
    }
}
| |
/// Returns the length of the longest suffix of `data` whose bytes all
/// satisfy `f`; bytes are tested from the end backwards.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    match data.iter().rposition(|&byte| !f(byte)) {
        Some(last_rejected) => data.len() - 1 - last_rejected,
        None => data.len(),
    }
}
| |
/// Counts how many times the byte `c` is repeated at the start of `data`.
pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
    data.iter().take_while(|&&byte| byte == c).count()
}
| |
| // Note: this scans ASCII whitespace only, for Unicode whitespace use |
| // a different function. |
| pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize { |
| scan_while(data, is_ascii_whitespace_no_nl) |
| } |
| |
| fn scan_attr_value_chars(data: &[u8]) -> usize { |
| scan_while(data, is_valid_unquoted_attr_value_char) |
| } |
| |
/// Scans a line ending at the start of `bytes`.
///
/// Returns `Some(0)` for empty input, `Some(1)` for `\n` or a lone `\r`,
/// `Some(2)` for `\r\n`, and `None` when `bytes` starts with anything else.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match (bytes.first(), bytes.get(1)) {
        (None, _) => Some(0),
        (Some(b'\n'), _) => Some(1),
        (Some(b'\r'), Some(b'\n')) => Some(2),
        (Some(b'\r'), _) => Some(1),
        _ => None,
    }
}
| |
| pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> { |
| let i = scan_whitespace_no_nl(bytes); |
| scan_eol(&bytes[i..]).map(|n| i + n) |
| } |
| |
| pub(crate) fn scan_nextline(bytes: &[u8]) -> usize { |
| memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1) |
| } |
| |
| // return: end byte for closing code fence, or None |
| // if the line is not a closing code fence |
| pub(crate) fn scan_closing_code_fence( |
| bytes: &[u8], |
| fence_char: u8, |
| n_fence_char: usize, |
| ) -> Option<usize> { |
| if bytes.is_empty() { |
| return Some(0); |
| } |
| let mut i = 0; |
| let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char); |
| if num_fence_chars_found < n_fence_char { |
| return None; |
| } |
| i += num_fence_chars_found; |
| let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' '); |
| i += num_trailing_spaces; |
| scan_eol(&bytes[i..]).map(|_| i) |
| } |
| |
/// Measures the indentation at the start of `text`.
///
/// Returns `(bytes, spaces)`: the number of leading whitespace bytes that
/// were consumed and the indentation width they amount to. A space counts
/// as one column and a tab advances to the next multiple of four columns.
///
/// Scanning stops at the first byte that is neither a space nor a tab,
/// after the space that brings the width to exactly `max`, or before a tab
/// that would push the width past `max` (such a tab is not consumed).
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut spaces = 0;
    // Count of bytes consumed so far. This is a length, not the index of
    // the last consumed byte, so callers can slice `text[consumed..]`.
    let mut consumed = 0;

    for &b in text {
        match b {
            b' ' => {
                spaces += 1;
                consumed += 1;
                if spaces == max {
                    break;
                }
            }
            b'\t' => {
                let new_spaces = spaces + 4 - (spaces & 3);
                if new_spaces > max {
                    break;
                }
                spaces = new_spaces;
                consumed += 1;
            }
            _ => break,
        }
    }

    (consumed, spaces)
}
| |
| /// Scan hrule opening sequence. |
| /// |
| /// Returns Ok(x) when it finds an hrule, where x is the |
| /// size of line containing the hrule, including the trailing newline. |
| /// |
| /// Returns Err(x) when it does not find an hrule and x is |
| /// the offset in data before no hrule can appear. |
| pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> { |
| if bytes.len() < 3 { |
| return Err(0); |
| } |
| let c = bytes[0]; |
| if !(c == b'*' || c == b'-' || c == b'_') { |
| return Err(0); |
| } |
| let mut n = 0; |
| let mut i = 0; |
| |
| while i < bytes.len() { |
| match bytes[i] { |
| b'\n' | b'\r' => { |
| i += scan_eol(&bytes[i..]).unwrap_or(0); |
| break; |
| } |
| c2 if c2 == c => { |
| n += 1; |
| } |
| b' ' | b'\t' => (), |
| _ => return Err(i), |
| } |
| i += 1; |
| } |
| if n >= 3 { |
| Ok(i) |
| } else { |
| Err(i) |
| } |
| } |
| |
| /// Scan an ATX heading opening sequence. |
| /// |
| /// Returns number of bytes in prefix and level. |
| pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<usize> { |
| let level = scan_ch_repeat(data, b'#'); |
| if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) { |
| Some(level) |
| } else { |
| None |
| } |
| } |
| |
| /// Scan a setext heading underline. |
| /// |
| /// Returns number of bytes in line (including trailing newline) and level. |
| pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, u32)> { |
| let c = *data.get(0)?; |
| if !(c == b'-' || c == b'=') { |
| return None; |
| } |
| let mut i = 1 + scan_ch_repeat(&data[1..], c); |
| i += scan_blank_line(&data[i..])?; |
| let level = if c == b'=' { 1 } else { 2 }; |
| Some((i, level)) |
| } |
| |
| // returns number of bytes in line (including trailing |
| // newline) and column alignments |
| pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) { |
| let (mut i, spaces) = calc_indent(data, 4); |
| if spaces > 3 || i == data.len() { |
| return (0, vec![]); |
| } |
| let mut cols = vec![]; |
| let mut active_col = Alignment::None; |
| let mut start_col = true; |
| if data[i] == b'|' { |
| i += 1; |
| } |
| for c in &data[i..] { |
| if let Some(n) = scan_eol(&data[i..]) { |
| i += n; |
| break; |
| } |
| match *c { |
| b' ' => (), |
| b':' => { |
| active_col = match (start_col, active_col) { |
| (true, Alignment::None) => Alignment::Left, |
| (false, Alignment::Left) => Alignment::Center, |
| (false, Alignment::None) => Alignment::Right, |
| _ => active_col, |
| }; |
| start_col = false; |
| } |
| b'-' => { |
| start_col = false; |
| } |
| b'|' => { |
| start_col = true; |
| cols.push(active_col); |
| active_col = Alignment::None; |
| } |
| _ => { |
| cols = vec![]; |
| start_col = true; |
| break; |
| } |
| } |
| i += 1; |
| } |
| |
| if !start_col { |
| cols.push(active_col); |
| } |
| |
| (i, cols) |
| } |
| |
| /// Scan code fence. |
| /// |
| /// Returns number of bytes scanned and the char that is repeated to make the code fence. |
| pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> { |
| let c = *data.get(0)?; |
| if !(c == b'`' || c == b'~') { |
| return None; |
| } |
| let i = 1 + scan_ch_repeat(&data[1..], c); |
| if i >= 3 { |
| if c == b'`' { |
| let suffix = &data[i..]; |
| let next_line = i + scan_nextline(suffix); |
| // FIXME: make sure this is correct |
| if suffix[..(next_line - i)].iter().any(|&b| b == b'`') { |
| return None; |
| } |
| } |
| Some((i, c)) |
| } else { |
| None |
| } |
| } |
| |
/// Returns `Some(2)` when `data` begins with `"> "`, otherwise `None`.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    match (data.first(), data.get(1)) {
        (Some(b'>'), Some(b' ')) => Some(2),
        _ => None,
    }
}
| |
| /// This already assumes the list item has been scanned. |
| pub(crate) fn scan_empty_list(data: &[u8]) -> bool { |
| let mut ix = 0; |
| for _ in 0..2 { |
| if let Some(bytes) = scan_blank_line(&data[ix..]) { |
| ix += bytes; |
| } else { |
| return false; |
| } |
| } |
| true |
| } |
| |
| // return number of bytes scanned, delimiter, start index, and indent |
| pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> { |
| let mut c = *bytes.get(0)?; |
| let (w, start) = match c { |
| b'-' | b'+' | b'*' => (1, 0), |
| b'0'..=b'9' => { |
| let (length, start) = parse_decimal(bytes); |
| c = *bytes.get(length)?; |
| if !(c == b'.' || c == b')') { |
| return None; |
| } |
| (length + 1, start) |
| } |
| _ => { |
| return None; |
| } |
| }; |
| // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness |
| let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5); |
| if postindent == 0 { |
| scan_eol(&bytes[w..])?; |
| postindent += 1; |
| } else if postindent > 4 { |
| postn = 1; |
| postindent = 1; |
| } |
| if scan_blank_line(&bytes[w..]).is_some() { |
| postn = 0; |
| postindent = 1; |
| } |
| Some((w + postn, c, start, w + postindent)) |
| } |
| |
| // returns (number of bytes, parsed decimal) |
| fn parse_decimal(bytes: &[u8]) -> (usize, usize) { |
| match bytes |
| .iter() |
| .take_while(|&&b| is_digit(b)) |
| .try_fold((0, 0usize), |(count, acc), c| { |
| let digit = usize::from(c - b'0'); |
| match acc |
| .checked_mul(10) |
| .and_then(|ten_acc| ten_acc.checked_add(digit)) |
| { |
| Some(number) => Ok((count + 1, number)), |
| // stop early on overflow |
| None => Err((count, acc)), |
| } |
| }) { |
| Ok(p) | Err(p) => p, |
| } |
| } |
| |
/// Parses the hexadecimal digits (either case) at the start of `bytes`.
///
/// Returns `(number of bytes, parsed hex)`. Parsing stops at the first
/// byte that is not a hex digit, or just before the digit that would
/// overflow `usize`.
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;
    for &b in bytes {
        let digit = match b {
            b'0'..=b'9' => b - b'0',
            b'a'..=b'f' => b - b'a' + 10,
            b'A'..=b'F' => b - b'A' + 10,
            _ => break,
        };
        match value
            .checked_mul(16)
            .and_then(|v| v.checked_add(usize::from(digit)))
        {
            Some(next) => {
                value = next;
                count += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (count, value)
}
| |
/// Converts a numeric code point into a `char`.
///
/// Zero maps to U+FFFD (the replacement character). Values that do not fit
/// in a `u32` or are not valid Unicode scalar values yield `None`.
fn char_from_codepoint(input: usize) -> Option<char> {
    let codepoint: u32 = input.try_into().ok()?;
    match codepoint {
        0 => Some('\u{FFFD}'),
        other => char::from_u32(other),
    }
}
| |
| // doesn't bother to check data[0] == '&' |
| pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) { |
| let mut end = 1; |
| if scan_ch(&bytes[end..], b'#') == 1 { |
| end += 1; |
| let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' { |
| end += 1; |
| parse_hex(&bytes[end..]) |
| } else { |
| parse_decimal(&bytes[end..]) |
| }; |
| end += bytecount; |
| return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 { |
| (0, None) |
| } else if let Some(c) = char_from_codepoint(codepoint) { |
| (end + 1, Some(c.into())) |
| } else { |
| (0, None) |
| }; |
| } |
| end += scan_while(&bytes[end..], is_ascii_alphanumeric); |
| if scan_ch(&bytes[end..], b';') == 1 { |
| if let Some(value) = entities::get_entity(&bytes[1..end]) { |
| return (end + 1, Some(value.into())); |
| } |
| } |
| (0, None) |
| } |
| |
// FIXME: we can most likely re-use other scanners
/// Scans the title of a link reference definition.
///
/// `text` must start with the opening delimiter: `'`, `"` or `(` (closed by
/// `)`). Returns `(bytelength, title_str)`, where `bytelength` covers both
/// delimiters and `title_str` is the raw text between them (escapes are
/// left untouched).
///
/// Returns `None` when the opening delimiter is missing, the title is not
/// closed, or the title contains a blank line.
pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
    // Same set as `is_ascii_whitespace_no_nl`, but compared on whole `char`s.
    // Casting a `char` to `u8` truncates it, which would make e.g. U+0120
    // look like a space and turn a non-blank line into a "blank" one.
    let is_line_padding = |ch: char| ch == ' ' || ch == '\t' || ch == '\x0b' || ch == '\x0c';

    let mut chars = text.chars().peekable();
    let closing_delim = match chars.next()? {
        '\'' => '\'',
        '"' => '"',
        '(' => ')',
        _ => return None,
    };
    let mut bytecount = 1;

    while let Some(c) = chars.next() {
        match c {
            '\n' => {
                bytecount += 1;
                // Skip the indentation of the continuation line.
                let mut next = *chars.peek()?;
                while is_line_padding(next) {
                    bytecount += chars.next()?.len_utf8();
                    next = *chars.peek()?;
                }
                if next == '\n' {
                    // blank line - not allowed
                    return None;
                }
            }
            '\\' => {
                // An escaped character can never close the title.
                let next_char = chars.next()?;
                bytecount += 1 + next_char.len_utf8();
            }
            c if c == closing_delim => {
                return Some((bytecount + 1, &text[1..bytecount]));
            }
            c => {
                bytecount += c.len_utf8();
            }
        }
    }
    None
}
| |
| // note: dest returned is raw, still needs to be unescaped |
| // TODO: check that nested parens are really not allowed for refdefs |
| // TODO(performance): this func should probably its own unescaping |
| pub(crate) fn scan_link_dest( |
| data: &str, |
| start_ix: usize, |
| max_next: usize, |
| ) -> Option<(usize, &str)> { |
| let bytes = &data.as_bytes()[start_ix..]; |
| let mut i = scan_ch(bytes, b'<'); |
| |
| if i != 0 { |
| // pointy links |
| while i < bytes.len() { |
| match bytes[i] { |
| b'\n' | b'\r' | b'<' => return None, |
| b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])), |
| b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => { |
| i += 1; |
| } |
| _ => {} |
| } |
| i += 1; |
| } |
| None |
| } else { |
| // non-pointy links |
| let mut nest = 0; |
| while i < bytes.len() { |
| match bytes[i] { |
| 0x0..=0x20 => { |
| break; |
| } |
| b'(' => { |
| if nest > max_next { |
| return None; |
| } |
| nest += 1; |
| } |
| b')' => { |
| if nest == 0 { |
| break; |
| } |
| nest -= 1; |
| } |
| b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => { |
| i += 1; |
| } |
| _ => {} |
| } |
| i += 1; |
| } |
| Some((i, &data[start_ix..(start_ix + i)])) |
| } |
| } |
| |
| /// Returns bytes scanned |
| fn scan_attribute_name(data: &[u8]) -> Option<usize> { |
| let (&c, tail) = data.split_first()?; |
| if is_ascii_alpha(c) || c == b'_' || c == b':' { |
| Some( |
| 1 + scan_while(tail, |c| { |
| is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-' |
| }), |
| ) |
| } else { |
| None |
| } |
| } |
| |
| /// Returns byte scanned (TODO: should it return new offset?) |
| fn scan_attribute(data: &[u8], allow_newline: bool) -> Option<usize> { |
| let whitespace_scanner = |
| |c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r'); |
| let mut ix = scan_attribute_name(data)?; |
| let n_whitespace = scan_while(&data[ix..], whitespace_scanner); |
| ix += n_whitespace; |
| if scan_ch(&data[ix..], b'=') == 1 { |
| ix += 1; |
| ix += scan_while(&data[ix..], whitespace_scanner); |
| ix += scan_attribute_value(&data[ix..], allow_newline)?; |
| } else if n_whitespace > 0 { |
| // Leave whitespace for next attribute. |
| ix -= 1; |
| } |
| Some(ix) |
| } |
| |
| fn scan_attribute_value(data: &[u8], allow_newline: bool) -> Option<usize> { |
| let mut i = 0; |
| match *data.get(0)? { |
| b @ b'"' | b @ b'\'' => { |
| i += 1; |
| i += scan_while(&data[i..], |c| { |
| c != b && (allow_newline || c != b'\n' && c != b'\r') |
| }); |
| if scan_ch(&data[i..], b) == 0 { |
| return None; |
| } |
| i += 1; |
| } |
| b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => { |
| return None; |
| } |
| _ => { |
| // unquoted attribute value |
| i += scan_attr_value_chars(&data[i..]); |
| } |
| } |
| Some(i) |
| } |
| |
| // Remove backslash escapes and resolve entities |
| pub(crate) fn unescape(input: &str) -> CowStr<'_> { |
| let mut result = String::new(); |
| let mut mark = 0; |
| let mut i = 0; |
| let bytes = input.as_bytes(); |
| while i < bytes.len() { |
| match bytes[i] { |
| b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => { |
| result.push_str(&input[mark..i]); |
| mark = i + 1; |
| i += 2; |
| } |
| b'&' => match scan_entity(&bytes[i..]) { |
| (n, Some(value)) => { |
| result.push_str(&input[mark..i]); |
| result.push_str(&value); |
| i += n; |
| mark = i; |
| } |
| _ => i += 1, |
| }, |
| b'\r' => { |
| result.push_str(&input[mark..i]); |
| i += 1; |
| mark = i; |
| } |
| _ => i += 1, |
| } |
| } |
| if mark == 0 { |
| input.into() |
| } else { |
| result.push_str(&input[mark..]); |
| result.into() |
| } |
| } |
| |
| /// Assumes `data` is preceded by `<`. |
| pub(crate) fn scan_html_block_tag(data: &[u8]) -> (usize, &[u8]) { |
| let i = scan_ch(data, b'/'); |
| let n = scan_while(&data[i..], is_ascii_alphanumeric); |
| // TODO: scan attributes and > |
| (i + n, &data[i..i + n]) |
| } |
| |
| pub(crate) fn is_html_tag(tag: &[u8]) -> bool { |
| HTML_TAGS |
| .binary_search_by(|probe| { |
| let probe_bytes_iter = probe.as_bytes().iter(); |
| let tag_bytes_iter = tag.iter(); |
| |
| probe_bytes_iter |
| .zip(tag_bytes_iter) |
| .find_map(|(&a, &b)| { |
| // We can compare case insensitively because the probes are |
| // all lower case alpha strings. |
| match a.cmp(&(b | 0x20)) { |
| std::cmp::Ordering::Equal => None, |
| inequality => Some(inequality), |
| } |
| }) |
| .unwrap_or_else(|| probe.len().cmp(&tag.len())) |
| }) |
| .is_ok() |
| } |
| |
| /// Assumes that `data` is preceded by `<`. |
| pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> { |
| let i = scan_html_block_inner(data, false)?; |
| scan_blank_line(&data[i..])?; |
| Some(i) |
| } |
| |
| fn scan_html_block_inner(data: &[u8], allow_newline: bool) -> Option<usize> { |
| let close_tag_bytes = scan_ch(&data, b'/'); |
| let l = scan_while(&data[close_tag_bytes..], is_ascii_alpha); |
| if l == 0 { |
| return None; |
| } |
| let mut i = close_tag_bytes + l; |
| i += scan_while(&data[i..], is_ascii_letterdigitdash); |
| |
| if close_tag_bytes == 0 { |
| loop { |
| let whitespace_scanner = |
| |c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r'); |
| let whitespace = scan_while(&data[i..], whitespace_scanner); |
| i += whitespace; |
| if let Some(b'/') | Some(b'>') = data.get(i) { |
| break; |
| } |
| if whitespace == 0 { |
| return None; |
| } |
| i += scan_attribute(&data[i..], allow_newline)?; |
| } |
| } |
| |
| i += scan_whitespace_no_nl(&data[i..]); |
| |
| if close_tag_bytes == 0 { |
| i += scan_ch(&data[i..], b'/'); |
| } |
| |
| let c = scan_ch(&data[i..], b'>'); |
| if c == 0 { |
| None |
| } else { |
| Some(i + c) |
| } |
| } |
| |
| /// Returns (next_byte_offset, uri, type) |
| pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> { |
| scan_uri(text, start_ix) |
| .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink)) |
| .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email))) |
| } |
| |
| /// Returns (next_byte_offset, uri) |
| fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> { |
| let bytes = &text.as_bytes()[start_ix..]; |
| |
| // scheme's first byte must be an ascii letter |
| if bytes.is_empty() || !is_ascii_alpha(bytes[0]) { |
| return None; |
| } |
| |
| let mut i = 1; |
| |
| while i < bytes.len() { |
| let c = bytes[i]; |
| i += 1; |
| match c { |
| c if is_ascii_alphanumeric(c) => (), |
| b'.' | b'-' | b'+' => (), |
| b':' => break, |
| _ => return None, |
| } |
| } |
| |
| // scheme length must be between 2 and 32 characters long. scheme |
| // must be followed by colon |
| if i < 3 || i > 33 { |
| return None; |
| } |
| |
| while i < bytes.len() { |
| match bytes[i] { |
| b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())), |
| b'\0'..=b' ' | b'<' => return None, |
| _ => (), |
| } |
| i += 1; |
| } |
| |
| None |
| } |
| |
| /// Returns (next_byte_offset, email) |
| fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> { |
| // using a regex library would be convenient, but doing it by hand is not too bad |
| let bytes = &text.as_bytes()[start_ix..]; |
| let mut i = 0; |
| |
| while i < bytes.len() { |
| let c = bytes[i]; |
| i += 1; |
| match c { |
| c if is_ascii_alphanumeric(c) => (), |
| b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?' |
| | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (), |
| b'@' => break, |
| _ => return None, |
| } |
| } |
| |
| loop { |
| let label_start_ix = i; |
| let mut fresh_label = true; |
| |
| while i < bytes.len() { |
| match bytes[i] { |
| c if is_ascii_alphanumeric(c) => (), |
| b'-' if fresh_label => { |
| return None; |
| } |
| b'-' => (), |
| _ => break, |
| } |
| fresh_label = false; |
| i += 1; |
| } |
| |
| if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' { |
| return None; |
| } |
| |
| if scan_ch(&bytes[i..], b'.') == 0 { |
| break; |
| } |
| i += 1; |
| } |
| |
| if scan_ch(&bytes[i..], b'>') == 0 { |
| return None; |
| } |
| |
| Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())) |
| } |
| |
| /// Scan comment, declaration, or CDATA section, with initial "<!" already consumed. |
| /// Returns byte offset on match. |
| fn scan_inline_html_comment( |
| bytes: &[u8], |
| mut ix: usize, |
| scan_guard: &mut HtmlScanGuard, |
| ) -> Option<usize> { |
| let c = *bytes.get(ix)?; |
| ix += 1; |
| match c { |
| b'-' => { |
| let dashes = scan_ch_repeat(&bytes[ix..], b'-'); |
| if dashes < 1 { |
| return None; |
| } |
| // Saw "<!--", scan comment. |
| ix += dashes; |
| if scan_ch(&bytes[ix..], b'>') == 1 { |
| return None; |
| } |
| |
| while let Some(x) = memchr(b'-', &bytes[ix..]) { |
| ix += x + 1; |
| if scan_ch(&bytes[ix..], b'-') == 1 { |
| ix += 1; |
| return if scan_ch(&bytes[ix..], b'>') == 1 { |
| Some(ix + 1) |
| } else { |
| None |
| }; |
| } |
| } |
| None |
| } |
| b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => { |
| ix += b"CDATA[".len(); |
| ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x); |
| let close_brackets = scan_ch_repeat(&bytes[ix..], b']'); |
| ix += close_brackets; |
| |
| if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 { |
| scan_guard.cdata = ix; |
| None |
| } else { |
| Some(ix + 1) |
| } |
| } |
| b'A'..=b'Z' if ix > scan_guard.declaration => { |
| // Scan declaration. |
| ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z'); |
| let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace); |
| if whitespace == 0 { |
| return None; |
| } |
| ix += whitespace; |
| ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x); |
| if scan_ch(&bytes[ix..], b'>') == 0 { |
| scan_guard.declaration = ix; |
| None |
| } else { |
| Some(ix + 1) |
| } |
| } |
| _ => None, |
| } |
| } |
| |
| /// Scan processing directive, with initial "<?" already consumed. |
| /// Returns the next byte offset on success. |
| fn scan_inline_html_processing( |
| bytes: &[u8], |
| mut ix: usize, |
| scan_guard: &mut HtmlScanGuard, |
| ) -> Option<usize> { |
| if ix <= scan_guard.processing { |
| return None; |
| } |
| while let Some(offset) = memchr(b'?', &bytes[ix..]) { |
| ix += offset + 1; |
| if scan_ch(&bytes[ix..], b'>') == 1 { |
| return Some(ix + 1); |
| } |
| } |
| scan_guard.processing = ix; |
| None |
| } |
| |
| /// Returns the next byte offset on success. |
| pub(crate) fn scan_inline_html( |
| bytes: &[u8], |
| ix: usize, |
| scan_guard: &mut HtmlScanGuard, |
| ) -> Option<usize> { |
| let c = *bytes.get(ix)?; |
| if c == b'!' { |
| scan_inline_html_comment(bytes, ix + 1, scan_guard) |
| } else if c == b'?' { |
| scan_inline_html_processing(bytes, ix + 1, scan_guard) |
| } else { |
| let i = scan_html_block_inner(&bytes[ix..], true)?; |
| Some(i + ix) |
| } |
| } |
| |
#[cfg(test)]
mod test {
    use super::*;

    /// A marker number with far more digits than `usize` can hold is not a
    /// list item (and must not panic).
    #[test]
    fn overflow_list() {
        let line: &[u8] = b"4444444444444444444444444444444444444444444444444444444444!";
        assert_eq!(scan_listitem(line), None);
    }

    /// Overflow caused by adding the last digit, rather than by the
    /// multiplication, is handled as well.
    #[test]
    fn overflow_by_addition() {
        let line: &[u8] = b"1844674407370955161615!";
        assert_eq!(scan_listitem(line), None);
    }
}