| use normalize::{ |
| hangul_decomposition_length, |
| is_hangul_syllable, |
| }; |
| use tables; |
| |
/// Longest run of consecutive nonstarters that `StreamSafe` lets through
/// before it inserts a Combining Grapheme Joiner.
pub(crate) const MAX_NONSTARTERS: usize = 30;

/// U+034F COMBINING GRAPHEME JOINER, the character inserted to break up
/// overlong nonstarter runs.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
| |
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
///
/// `Clone` and `Debug` are available whenever the wrapped iterator provides
/// them.
#[derive(Clone, Debug)]
pub struct StreamSafe<I> {
    /// The wrapped source of characters.
    iter: I,
    /// Number of consecutive nonstarters seen since the count was last reset.
    nonstarter_count: usize,
    /// Character held back while a Combining Grapheme Joiner is emitted in
    /// its place; it is picked up again by the following call to `next`.
    buffer: Option<char>,
}
| |
| impl<I> StreamSafe<I> { |
| pub(crate) fn new(iter: I) -> Self { |
| Self { iter, nonstarter_count: 0, buffer: None } |
| } |
| } |
| |
| impl<I: Iterator<Item=char>> Iterator for StreamSafe<I> { |
| type Item = char; |
| |
| #[inline] |
| fn next(&mut self) -> Option<char> { |
| if let Some(ch) = self.buffer.take() { |
| return Some(ch); |
| } |
| let next_ch = match self.iter.next() { |
| None => return None, |
| Some(c) => c, |
| }; |
| let d = classify_nonstarters(next_ch); |
| if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS { |
| self.buffer = Some(next_ch); |
| self.nonstarter_count = 0; |
| return Some(COMBINING_GRAPHEME_JOINER); |
| } |
| |
| // No starters in the decomposition, so keep accumulating |
| if d.leading_nonstarters == d.decomposition_len { |
| self.nonstarter_count += d.decomposition_len; |
| } |
| // Otherwise, restart the nonstarter counter. |
| else { |
| self.nonstarter_count = d.trailing_nonstarters; |
| } |
| Some(next_ch) |
| } |
| } |
| |
/// Summary of one character's decomposition, as produced by
/// `classify_nonstarters`.
#[derive(Debug)]
pub(crate) struct Decomposition {
    /// Count of nonstarters at the start of the decomposition.
    pub(crate) leading_nonstarters: usize,
    /// Count of nonstarters at the end of the decomposition.
    pub(crate) trailing_nonstarters: usize,
    /// Total number of characters in the decomposition.
    pub(crate) decomposition_len: usize,
}
| |
| #[inline] |
| pub(crate) fn classify_nonstarters(c: char) -> Decomposition { |
| // As usual, fast path for ASCII (which is always a starter) |
| if c <= '\x7f' { |
| return Decomposition { |
| leading_nonstarters: 0, |
| trailing_nonstarters: 0, |
| decomposition_len: 1, |
| } |
| } |
| // Next, special case Hangul, since it's not handled by our tables. |
| if is_hangul_syllable(c) { |
| return Decomposition { |
| leading_nonstarters: 0, |
| trailing_nonstarters: 0, |
| decomposition_len: hangul_decomposition_length(c), |
| }; |
| } |
| let decomp = tables::compatibility_fully_decomposed(c) |
| .or_else(|| tables::canonical_fully_decomposed(c)); |
| match decomp { |
| Some(decomp) => { |
| Decomposition { |
| leading_nonstarters: tables::stream_safe_leading_nonstarters(c), |
| trailing_nonstarters: tables::stream_safe_trailing_nonstarters(c), |
| decomposition_len: decomp.len(), |
| } |
| }, |
| None => { |
| let is_nonstarter = tables::canonical_combining_class(c) != 0; |
| let nonstarter = if is_nonstarter { 1 } else { 0 }; |
| Decomposition { |
| leading_nonstarters: nonstarter, |
| trailing_nonstarters: nonstarter, |
| decomposition_len: 1, |
| } |
| } |
| } |
| } |
| |
#[cfg(test)]
mod tests {
    use super::{
        StreamSafe,
        classify_nonstarters,
    };
    use std::char;
    use normalization_tests::NORMALIZATION_TESTS;
    use normalize::decompose_compatible;
    use tables;

    /// Runs `s` through `StreamSafe` and collects the output.
    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    /// Every string in the normalization test data must pass through the
    /// adaptor unchanged.
    #[test]
    fn test_normalization_tests_unaffected() {
        for test in NORMALIZATION_TESTS {
            for &s in &[test.source, test.nfc, test.nfd, test.nfkc, test.nfkd] {
                assert_eq!(stream_safe(s), s);
            }
        }
    }

    /// Thirty combining marks after a starter are left alone; a thirty-first
    /// gets a Combining Grapheme Joiner inserted directly in front of it.
    #[test]
    fn test_simple() {
        // U+0300 through U+031D: thirty consecutive combining marks.
        let thirty_marks: String = (0x0300..0x031e).filter_map(char::from_u32).collect();
        assert_eq!(thirty_marks.chars().count(), 30);

        let technically_okay = format!("Da{}ngerzone", thirty_marks);
        assert_eq!(stream_safe(&technically_okay), technically_okay);

        // Pin the exact output rather than merely checking that it changed.
        let too_much = format!("Da{}\u{032e}ngerzone", thirty_marks);
        let fixed_it = format!("Da{}\u{034f}\u{032e}ngerzone", thirty_marks);
        assert_eq!(stream_safe(&too_much), fixed_it);
    }

    /// Cross-checks `classify_nonstarters` against an actual compatibility
    /// decomposition for every scalar value up to the end of the table.
    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for code_point in 0..0x2FA1E {
            let ch = match char::from_u32(code_point) {
                Some(c) => c,
                // Surrogates are not scalar values; skip them.
                None => continue,
            };
            let classified = classify_nonstarters(ch);
            let mut decomposed = Vec::new();
            decompose_compatible(ch, |c| decomposed.push(c));

            assert_eq!(decomposed.len(), classified.decomposition_len);

            let num_leading = decomposed
                .iter()
                .take_while(|&&c| tables::canonical_combining_class(c) != 0)
                .count();
            let num_trailing = decomposed
                .iter()
                .rev()
                .take_while(|&&c| tables::canonical_combining_class(c) != 0)
                .count();

            assert_eq!(num_leading, classified.leading_nonstarters);
            assert_eq!(num_trailing, classified.trailing_nonstarters);
        }
    }
}