| // Copyright 2018 Google LLC |
| // |
| // Permission is hereby granted, free of charge, to any person obtaining a copy |
| // of this software and associated documentation files (the "Software"), to deal |
| // in the Software without restriction, including without limitation the rights |
| // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| // copies of the Software, and to permit persons to whom the Software is |
| // furnished to do so, subject to the following conditions: |
| // |
| // The above copyright notice and this permission notice shall be included in |
| // all copies or substantial portions of the Software. |
| // |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| // THE SOFTWARE. |
| |
| //! Link label parsing and matching. |
| |
| use unicase::UniCase; |
| |
| use crate::scanners::{is_ascii_whitespace, scan_eol}; |
| use crate::strings::CowStr; |
| |
| pub enum ReferenceLabel<'a> { |
| Link(CowStr<'a>), |
| Footnote(CowStr<'a>), |
| } |
| |
| pub type LinkLabel<'a> = UniCase<CowStr<'a>>; |
| |
| /// Assumes the opening bracket has already been scanned. |
| /// Returns the number of bytes read (including closing bracket) and label on success. |
| pub(crate) fn scan_link_label_rest<F>( |
| text: &str, |
| start: usize, |
| linebreak_handler: F, |
| ) -> Option<(usize, CowStr<'_>)> |
| where |
| // Takes a _global_ index into the document string, returns how many bytes to skip |
| F: Fn(usize) -> Option<usize>, |
| { |
| let bytes = text.as_bytes(); |
| let mut ix = start; |
| let mut only_white_space = true; |
| let mut codepoints = 0; |
| // no worries, doesnt allocate until we push things onto it |
| let mut label = String::new(); |
| let mut mark = start; |
| |
| loop { |
| if codepoints >= 1000 { |
| return None; |
| } |
| match *bytes.get(ix)? { |
| b'[' => return None, |
| b']' => break, |
| b'\\' => { |
| ix += 2; |
| codepoints += 2; |
| only_white_space = false; |
| } |
| b if is_ascii_whitespace(b) => { |
| // normalize labels by collapsing whitespaces, including linebreaks |
| let mut whitespaces = 0; |
| let mut linebreaks = 0; |
| let whitespace_start = ix; |
| |
| while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) { |
| if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
| linebreaks += 1; |
| if linebreaks > 1 { |
| return None; |
| } |
| ix += eol_bytes; |
| ix += linebreak_handler(ix)?; |
| whitespaces += 2; // indicate that we need to replace |
| } else { |
| whitespaces += if bytes[ix] == b' ' { 1 } else { 2 }; |
| ix += 1; |
| } |
| } |
| if whitespaces > 1 { |
| label.push_str(&text[mark..whitespace_start]); |
| label.push(' '); |
| mark = ix; |
| codepoints += ix - whitespace_start; |
| } else { |
| codepoints += 1; |
| } |
| } |
| b => { |
| only_white_space = false; |
| ix += 1; |
| if b & 0b1000_0000 != 0 { |
| codepoints += 1; |
| } |
| } |
| } |
| } |
| |
| if only_white_space { |
| None |
| } else { |
| let cow = if mark == 0 { |
| text[start..ix].into() |
| } else { |
| label.push_str(&text[mark..ix]); |
| label.into() |
| }; |
| Some((ix + 1 - start, cow)) |
| } |
| } |
| |
#[cfg(test)]
mod test {
    use super::scan_link_label_rest;

    /// Runs of tabs inside a label come back as single regular spaces.
    #[test]
    fn whitespace_normalization() {
        let source = "«\t\tBlurry Eyes\t\t»][blurry_eyes]";

        // The handler rejects every line break; this input contains none.
        let scanned = scan_link_label_rest(source, 0, |_| None);
        let (_consumed, label) = scanned.unwrap();

        assert_eq!("« Blurry Eyes »", label.as_ref());
    }

    /// CRLF line endings, one inside the label and one directly before the
    /// closing bracket, still yield a label.
    #[test]
    fn return_carriage_linefeed_ok() {
        let scanned = scan_link_label_rest("hello\r\nworld\r\n]", 0, |_| Some(0));
        assert!(scanned.is_some());
    }
}