| use std::cmp; |
| use std::fmt; |
| use std::panic::{RefUnwindSafe, UnwindSafe}; |
| use std::u8; |
| |
| use memchr::{memchr, memchr2, memchr3}; |
| |
| use crate::ahocorasick::MatchKind; |
| use crate::packed; |
| use crate::Match; |
| |
| /// A candidate is the result of running a prefilter on a haystack at a |
| /// particular position. The result is either no match, a confirmed match or |
| /// a possible match. |
| /// |
| /// When no match is returned, the prefilter is guaranteeing that no possible |
| /// match can be found in the haystack, and the caller may trust this. That is, |
| /// all correct prefilters must never report false negatives. |
| /// |
| /// In some cases, a prefilter can confirm a match very quickly, in which case, |
| /// the caller may use this to stop what it's doing and report the match. In |
| /// this case, prefilter implementations must never report a false positive. |
| /// In other cases, the prefilter can only report a potential match, in which |
| /// case the callers must attempt to confirm the match. In this case, prefilter |
| /// implementations are permitted to return false positives. |
| #[derive(Clone, Debug)] |
| pub enum Candidate { |
| None, |
| Match(Match), |
| PossibleStartOfMatch(usize), |
| } |
| |
| impl Candidate { |
| /// Convert this candidate into an option. This is useful when callers |
| /// do not distinguish between true positives and false positives (i.e., |
| /// the caller must always confirm the match in order to update some other |
| /// state). |
| pub fn into_option(self) -> Option<usize> { |
| match self { |
| Candidate::None => None, |
| Candidate::Match(ref m) => Some(m.start()), |
| Candidate::PossibleStartOfMatch(start) => Some(start), |
| } |
| } |
| } |
| |
| /// A prefilter describes the behavior of fast literal scanners for quickly |
| /// skipping past bytes in the haystack that we know cannot possibly |
| /// participate in a match. |
| pub trait Prefilter: |
| Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug |
| { |
| /// Returns the next possible match candidate. This may yield false |
| /// positives, so callers must confirm a match starting at the position |
| /// returned. This, however, must never produce false negatives. That is, |
| /// this must, at minimum, return the starting position of the next match |
| /// in the given haystack after or at the given position. |
| fn next_candidate( |
| &self, |
| state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate; |
| |
| /// A method for cloning a prefilter, to work-around the fact that Clone |
| /// is not object-safe. |
| fn clone_prefilter(&self) -> Box<dyn Prefilter>; |
| |
| /// Returns the approximate total amount of heap used by this prefilter, in |
| /// units of bytes. |
| fn heap_bytes(&self) -> usize; |
| |
| /// Returns true if and only if this prefilter never returns false |
| /// positives. This is useful for completely avoiding the automaton |
| /// when the prefilter can quickly confirm its own matches. |
| /// |
| /// By default, this returns true, which is conservative; it is always |
| /// correct to return `true`. Returning `false` here and reporting a false |
| /// positive will result in incorrect searches. |
| fn reports_false_positives(&self) -> bool { |
| true |
| } |
| |
| /// Returns true if and only if this prefilter may look for a non-starting |
| /// position of a match. |
| /// |
| /// This is useful in a streaming context where prefilters that don't look |
| /// for a starting position of a match can be quite difficult to deal with. |
| /// |
| /// This returns false by default. |
| fn looks_for_non_start_of_match(&self) -> bool { |
| false |
| } |
| } |
| |
| impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { |
| #[inline] |
| fn next_candidate( |
| &self, |
| state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| (**self).next_candidate(state, haystack, at) |
| } |
| |
| fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
| (**self).clone_prefilter() |
| } |
| |
| fn heap_bytes(&self) -> usize { |
| (**self).heap_bytes() |
| } |
| |
| fn reports_false_positives(&self) -> bool { |
| (**self).reports_false_positives() |
| } |
| } |
| |
| /// A convenience object for representing any type that implements Prefilter |
| /// and is cloneable. |
| #[derive(Debug)] |
| pub struct PrefilterObj(Box<dyn Prefilter>); |
| |
| impl Clone for PrefilterObj { |
| fn clone(&self) -> Self { |
| PrefilterObj(self.0.clone_prefilter()) |
| } |
| } |
| |
| impl PrefilterObj { |
| /// Create a new prefilter object. |
| pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj { |
| PrefilterObj(Box::new(t)) |
| } |
| |
| /// Return the underlying prefilter trait object. |
| pub fn as_ref(&self) -> &dyn Prefilter { |
| &*self.0 |
| } |
| } |
| |
| /// PrefilterState tracks state associated with the effectiveness of a |
| /// prefilter. It is used to track how many bytes, on average, are skipped by |
| /// the prefilter. If this average dips below a certain threshold over time, |
| /// then the state renders the prefilter inert and stops using it. |
| /// |
| /// A prefilter state should be created for each search. (Where creating an |
| /// iterator via, e.g., `find_iter`, is treated as a single search.) |
| #[derive(Clone, Debug)] |
| pub struct PrefilterState { |
| /// The number of skips that has been executed. |
| skips: usize, |
| /// The total number of bytes that have been skipped. |
| skipped: usize, |
| /// The maximum length of a match. This is used to help determine how many |
| /// bytes on average should be skipped in order for a prefilter to be |
| /// effective. |
| max_match_len: usize, |
| /// Once this heuristic has been deemed permanently ineffective, it will be |
| /// inert throughout the rest of its lifetime. This serves as a cheap way |
| /// to check inertness. |
| inert: bool, |
| /// The last (absolute) position at which a prefilter scanned to. |
| /// Prefilters can use this position to determine whether to re-scan or |
| /// not. |
| /// |
| /// Unlike other things that impact effectiveness, this is a fleeting |
| /// condition. That is, a prefilter can be considered ineffective if it is |
| /// at a position before `last_scan_at`, but can become effective again |
| /// once the search moves past `last_scan_at`. |
| /// |
| /// The utility of this is to both avoid additional overhead from calling |
| /// the prefilter and to avoid quadratic behavior. This ensures that a |
| /// prefilter will scan any particular byte at most once. (Note that some |
| /// prefilters, like the start-byte prefilter, do not need to use this |
| /// field at all, since it only looks for starting bytes.) |
| last_scan_at: usize, |
| } |
| |
| impl PrefilterState { |
| /// The minimum number of skip attempts to try before considering whether |
| /// a prefilter is effective or not. |
| const MIN_SKIPS: usize = 40; |
| |
| /// The minimum amount of bytes that skipping must average, expressed as a |
| /// factor of the multiple of the length of a possible match. |
| /// |
| /// That is, after MIN_SKIPS have occurred, if the average number of bytes |
| /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the |
| /// prefilter outed to be rendered inert. |
| const MIN_AVG_FACTOR: usize = 2; |
| |
| /// Create a fresh prefilter state. |
| pub fn new(max_match_len: usize) -> PrefilterState { |
| PrefilterState { |
| skips: 0, |
| skipped: 0, |
| max_match_len, |
| inert: false, |
| last_scan_at: 0, |
| } |
| } |
| |
| /// Create a prefilter state that always disables the prefilter. |
| pub fn disabled() -> PrefilterState { |
| PrefilterState { |
| skips: 0, |
| skipped: 0, |
| max_match_len: 0, |
| inert: true, |
| last_scan_at: 0, |
| } |
| } |
| |
| /// Update this state with the number of bytes skipped on the last |
| /// invocation of the prefilter. |
| #[inline] |
| fn update_skipped_bytes(&mut self, skipped: usize) { |
| self.skips += 1; |
| self.skipped += skipped; |
| } |
| |
| /// Updates the position at which the last scan stopped. This may be |
| /// greater than the position of the last candidate reported. For example, |
| /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz` |
| /// will report a candidate at position `0`, but the end of its last scan |
| /// will be at position `3`. |
| /// |
| /// This position factors into the effectiveness of this prefilter. If the |
| /// current position is less than the last position at which a scan ended, |
| /// then the prefilter should not be re-run until the search moves past |
| /// that position. |
| #[inline] |
| fn update_at(&mut self, at: usize) { |
| if at > self.last_scan_at { |
| self.last_scan_at = at; |
| } |
| } |
| |
| /// Return true if and only if this state indicates that a prefilter is |
| /// still effective. |
| /// |
| /// The given pos should correspond to the current starting position of the |
| /// search. |
| #[inline] |
| pub fn is_effective(&mut self, at: usize) -> bool { |
| if self.inert { |
| return false; |
| } |
| if at < self.last_scan_at { |
| return false; |
| } |
| if self.skips < PrefilterState::MIN_SKIPS { |
| return true; |
| } |
| |
| let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len; |
| if self.skipped >= min_avg * self.skips { |
| return true; |
| } |
| |
| // We're inert. |
| self.inert = true; |
| false |
| } |
| } |
| |
| /// A builder for constructing the best possible prefilter. When constructed, |
| /// this builder will heuristically select the best prefilter it can build, |
| /// if any, and discard the rest. |
| #[derive(Debug)] |
| pub struct Builder { |
| count: usize, |
| ascii_case_insensitive: bool, |
| start_bytes: StartBytesBuilder, |
| rare_bytes: RareBytesBuilder, |
| packed: Option<packed::Builder>, |
| } |
| |
| impl Builder { |
| /// Create a new builder for constructing the best possible prefilter. |
| pub fn new(kind: MatchKind) -> Builder { |
| let pbuilder = kind |
| .as_packed() |
| .map(|kind| packed::Config::new().match_kind(kind).builder()); |
| Builder { |
| count: 0, |
| ascii_case_insensitive: false, |
| start_bytes: StartBytesBuilder::new(), |
| rare_bytes: RareBytesBuilder::new(), |
| packed: pbuilder, |
| } |
| } |
| |
| /// Enable ASCII case insensitivity. When set, byte strings added to this |
| /// builder will be interpreted without respect to ASCII case. |
| pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder { |
| self.ascii_case_insensitive = yes; |
| self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); |
| self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); |
| self |
| } |
| |
| /// Return a prefilter suitable for quickly finding potential matches. |
| /// |
| /// All patterns added to an Aho-Corasick automaton should be added to this |
| /// builder before attempting to construct the prefilter. |
| pub fn build(&self) -> Option<PrefilterObj> { |
| // match (self.start_bytes.build(), self.rare_bytes.build()) { |
| match (self.start_bytes.build(), self.rare_bytes.build()) { |
| // If we could build both start and rare prefilters, then there are |
| // a few cases in which we'd want to use the start-byte prefilter |
| // over the rare-byte prefilter, since the former has lower |
| // overhead. |
| (prestart @ Some(_), prerare @ Some(_)) => { |
| // If the start-byte prefilter can scan for a smaller number |
| // of bytes than the rare-byte prefilter, then it's probably |
| // faster. |
| let has_fewer_bytes = |
| self.start_bytes.count < self.rare_bytes.count; |
| // Otherwise, if the combined frequency rank of the detected |
| // bytes in the start-byte prefilter is "close" to the combined |
| // frequency rank of the rare-byte prefilter, then we pick |
| // the start-byte prefilter even if the rare-byte prefilter |
| // heuristically searches for rare bytes. This is because the |
| // rare-byte prefilter has higher constant costs, so we tend to |
| // prefer the start-byte prefilter when we can. |
| let has_rarer_bytes = |
| self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; |
| if has_fewer_bytes || has_rarer_bytes { |
| prestart |
| } else { |
| prerare |
| } |
| } |
| (prestart @ Some(_), None) => prestart, |
| (None, prerare @ Some(_)) => prerare, |
| (None, None) if self.ascii_case_insensitive => None, |
| (None, None) => self |
| .packed |
| .as_ref() |
| .and_then(|b| b.build()) |
| .map(|s| PrefilterObj::new(Packed(s))), |
| } |
| } |
| |
| /// Add a literal string to this prefilter builder. |
| pub fn add(&mut self, bytes: &[u8]) { |
| self.count += 1; |
| self.start_bytes.add(bytes); |
| self.rare_bytes.add(bytes); |
| if let Some(ref mut pbuilder) = self.packed { |
| pbuilder.add(bytes); |
| } |
| } |
| } |
| |
| /// A type that wraps a packed searcher and implements the `Prefilter` |
| /// interface. |
| #[derive(Clone, Debug)] |
| struct Packed(packed::Searcher); |
| |
| impl Prefilter for Packed { |
| fn next_candidate( |
| &self, |
| _state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match) |
| } |
| |
| fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
| Box::new(self.clone()) |
| } |
| |
| fn heap_bytes(&self) -> usize { |
| self.0.heap_bytes() |
| } |
| |
| fn reports_false_positives(&self) -> bool { |
| false |
| } |
| } |
| |
| /// A builder for constructing a rare byte prefilter. |
| /// |
| /// A rare byte prefilter attempts to pick out a small set of rare bytes that |
| /// occurr in the patterns, and then quickly scan to matches of those rare |
| /// bytes. |
| #[derive(Clone, Debug)] |
| struct RareBytesBuilder { |
| /// Whether this prefilter should account for ASCII case insensitivity or |
| /// not. |
| ascii_case_insensitive: bool, |
| /// A set of rare bytes, indexed by byte value. |
| rare_set: ByteSet, |
| /// A set of byte offsets associated with bytes in a pattern. An entry |
| /// corresponds to a particular bytes (its index) and is only non-zero if |
| /// the byte occurred at an offset greater than 0 in at least one pattern. |
| /// |
| /// If a byte's offset is not representable in 8 bits, then the rare bytes |
| /// prefilter becomes inert. |
| byte_offsets: RareByteOffsets, |
| /// Whether this is available as a prefilter or not. This can be set to |
| /// false during construction if a condition is seen that invalidates the |
| /// use of the rare-byte prefilter. |
| available: bool, |
| /// The number of bytes set to an active value in `byte_offsets`. |
| count: usize, |
| /// The sum of frequency ranks for the rare bytes detected. This is |
| /// intended to give a heuristic notion of how rare the bytes are. |
| rank_sum: u16, |
| } |
| |
| /// A set of bytes. |
| #[derive(Clone, Copy)] |
| struct ByteSet([bool; 256]); |
| |
| impl ByteSet { |
| fn empty() -> ByteSet { |
| ByteSet([false; 256]) |
| } |
| |
| fn insert(&mut self, b: u8) -> bool { |
| let new = !self.contains(b); |
| self.0[b as usize] = true; |
| new |
| } |
| |
| fn contains(&self, b: u8) -> bool { |
| self.0[b as usize] |
| } |
| } |
| |
| impl fmt::Debug for ByteSet { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let mut bytes = vec![]; |
| for b in 0..=255 { |
| if self.contains(b) { |
| bytes.push(b); |
| } |
| } |
| f.debug_struct("ByteSet").field("set", &bytes).finish() |
| } |
| } |
| |
| /// A set of byte offsets, keyed by byte. |
| #[derive(Clone, Copy)] |
| struct RareByteOffsets { |
| /// Each entry corresponds to the maximum offset of the corresponding |
| /// byte across all patterns seen. |
| set: [RareByteOffset; 256], |
| } |
| |
| impl RareByteOffsets { |
| /// Create a new empty set of rare byte offsets. |
| pub fn empty() -> RareByteOffsets { |
| RareByteOffsets { set: [RareByteOffset::default(); 256] } |
| } |
| |
| /// Add the given offset for the given byte to this set. If the offset is |
| /// greater than the existing offset, then it overwrites the previous |
| /// value and returns false. If there is no previous value set, then this |
| /// sets it and returns true. |
| pub fn set(&mut self, byte: u8, off: RareByteOffset) { |
| self.set[byte as usize].max = |
| cmp::max(self.set[byte as usize].max, off.max); |
| } |
| } |
| |
| impl fmt::Debug for RareByteOffsets { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let mut offsets = vec![]; |
| for off in self.set.iter() { |
| if off.max > 0 { |
| offsets.push(off); |
| } |
| } |
| f.debug_struct("RareByteOffsets").field("set", &offsets).finish() |
| } |
| } |
| |
| /// Offsets associated with an occurrence of a "rare" byte in any of the |
| /// patterns used to construct a single Aho-Corasick automaton. |
| #[derive(Clone, Copy, Debug)] |
| struct RareByteOffset { |
| /// The maximum offset at which a particular byte occurs from the start |
| /// of any pattern. This is used as a shift amount. That is, when an |
| /// occurrence of this byte is found, the candidate position reported by |
| /// the prefilter is `position_of_byte - max`, such that the automaton |
| /// will begin its search at a position that is guaranteed to observe a |
| /// match. |
| /// |
| /// To avoid accidentally quadratic behavior, a prefilter is considered |
| /// ineffective when it is asked to start scanning from a position that it |
| /// has already scanned past. |
| /// |
| /// Using a `u8` here means that if we ever see a pattern that's longer |
| /// than 255 bytes, then the entire rare byte prefilter is disabled. |
| max: u8, |
| } |
| |
| impl Default for RareByteOffset { |
| fn default() -> RareByteOffset { |
| RareByteOffset { max: 0 } |
| } |
| } |
| |
| impl RareByteOffset { |
| /// Create a new rare byte offset. If the given offset is too big, then |
| /// None is returned. In that case, callers should render the rare bytes |
| /// prefilter inert. |
| fn new(max: usize) -> Option<RareByteOffset> { |
| if max > u8::MAX as usize { |
| None |
| } else { |
| Some(RareByteOffset { max: max as u8 }) |
| } |
| } |
| } |
| |
| impl RareBytesBuilder { |
| /// Create a new builder for constructing a rare byte prefilter. |
| fn new() -> RareBytesBuilder { |
| RareBytesBuilder { |
| ascii_case_insensitive: false, |
| rare_set: ByteSet::empty(), |
| byte_offsets: RareByteOffsets::empty(), |
| available: true, |
| count: 0, |
| rank_sum: 0, |
| } |
| } |
| |
| /// Enable ASCII case insensitivity. When set, byte strings added to this |
| /// builder will be interpreted without respect to ASCII case. |
| fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { |
| self.ascii_case_insensitive = yes; |
| self |
| } |
| |
| /// Build the rare bytes prefilter. |
| /// |
| /// If there are more than 3 distinct starting bytes, or if heuristics |
| /// otherwise determine that this prefilter should not be used, then `None` |
| /// is returned. |
| fn build(&self) -> Option<PrefilterObj> { |
| if !self.available || self.count > 3 { |
| return None; |
| } |
| let (mut bytes, mut len) = ([0; 3], 0); |
| for b in 0..=255 { |
| if self.rare_set.contains(b) { |
| bytes[len] = b as u8; |
| len += 1; |
| } |
| } |
| match len { |
| 0 => None, |
| 1 => Some(PrefilterObj::new(RareBytesOne { |
| byte1: bytes[0], |
| offset: self.byte_offsets.set[bytes[0] as usize], |
| })), |
| 2 => Some(PrefilterObj::new(RareBytesTwo { |
| offsets: self.byte_offsets, |
| byte1: bytes[0], |
| byte2: bytes[1], |
| })), |
| 3 => Some(PrefilterObj::new(RareBytesThree { |
| offsets: self.byte_offsets, |
| byte1: bytes[0], |
| byte2: bytes[1], |
| byte3: bytes[2], |
| })), |
| _ => unreachable!(), |
| } |
| } |
| |
| /// Add a byte string to this builder. |
| /// |
| /// All patterns added to an Aho-Corasick automaton should be added to this |
| /// builder before attempting to construct the prefilter. |
| fn add(&mut self, bytes: &[u8]) { |
| // If we've already given up, then do nothing. |
| if !self.available { |
| return; |
| } |
| // If we've already blown our budget, then don't waste time looking |
| // for more rare bytes. |
| if self.count > 3 { |
| self.available = false; |
| return; |
| } |
| // If the pattern is too long, then our offset table is bunk, so |
| // give up. |
| if bytes.len() >= 256 { |
| self.available = false; |
| return; |
| } |
| let mut rarest = match bytes.get(0) { |
| None => return, |
| Some(&b) => (b, freq_rank(b)), |
| }; |
| // The idea here is to look for the rarest byte in each pattern, and |
| // add that to our set. As a special exception, if we see a byte that |
| // we've already added, then we immediately stop and choose that byte, |
| // even if there's another rare byte in the pattern. This helps us |
| // apply the rare byte optimization in more cases by attempting to pick |
| // bytes that are in common between patterns. So for example, if we |
| // were searching for `Sherlock` and `lockjaw`, then this would pick |
| // `k` for both patterns, resulting in the use of `memchr` instead of |
| // `memchr2` for `k` and `j`. |
| let mut found = false; |
| for (pos, &b) in bytes.iter().enumerate() { |
| self.set_offset(pos, b); |
| if found { |
| continue; |
| } |
| if self.rare_set.contains(b) { |
| found = true; |
| continue; |
| } |
| let rank = freq_rank(b); |
| if rank < rarest.1 { |
| rarest = (b, rank); |
| } |
| } |
| if !found { |
| self.add_rare_byte(rarest.0); |
| } |
| } |
| |
| fn set_offset(&mut self, pos: usize, byte: u8) { |
| // This unwrap is OK because pos is never bigger than our max. |
| let offset = RareByteOffset::new(pos).unwrap(); |
| self.byte_offsets.set(byte, offset); |
| if self.ascii_case_insensitive { |
| self.byte_offsets.set(opposite_ascii_case(byte), offset); |
| } |
| } |
| |
| fn add_rare_byte(&mut self, byte: u8) { |
| self.add_one_rare_byte(byte); |
| if self.ascii_case_insensitive { |
| self.add_one_rare_byte(opposite_ascii_case(byte)); |
| } |
| } |
| |
| fn add_one_rare_byte(&mut self, byte: u8) { |
| if self.rare_set.insert(byte) { |
| self.count += 1; |
| self.rank_sum += freq_rank(byte) as u16; |
| } |
| } |
| } |
| |
| /// A prefilter for scanning for a single "rare" byte. |
| #[derive(Clone, Debug)] |
| struct RareBytesOne { |
| byte1: u8, |
| offset: RareByteOffset, |
| } |
| |
| impl Prefilter for RareBytesOne { |
| fn next_candidate( |
| &self, |
| state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| memchr(self.byte1, &haystack[at..]) |
| .map(|i| { |
| let pos = at + i; |
| state.last_scan_at = pos; |
| cmp::max(at, pos.saturating_sub(self.offset.max as usize)) |
| }) |
| .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
| } |
| |
| fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
| Box::new(self.clone()) |
| } |
| |
| fn heap_bytes(&self) -> usize { |
| 0 |
| } |
| |
| fn looks_for_non_start_of_match(&self) -> bool { |
| // TODO: It should be possible to use a rare byte prefilter in a |
| // streaming context. The main problem is that we usually assume that |
| // if a prefilter has scanned some text and not found anything, then no |
| // match *starts* in that text. This doesn't matter in non-streaming |
| // contexts, but in a streaming context, if we're looking for a byte |
| // that doesn't start at the beginning of a match and don't find it, |
| // then it's still possible for a match to start at the end of the |
| // current buffer content. In order to fix this, the streaming searcher |
| // would need to become aware of prefilters that do this and use the |
| // appropriate offset in various places. It is quite a delicate change |
| // and probably shouldn't be attempted until streaming search has a |
| // better testing strategy. In particular, we'd really like to be able |
| // to vary the buffer size to force strange cases that occur at the |
| // edge of the buffer. If we make the buffer size minimal, then these |
| // cases occur more frequently and easier. |
| // |
| // This is also a bummer because this means that if the prefilter |
| // builder chose a rare byte prefilter, then a streaming search won't |
| // use any prefilter at all because the builder doesn't know how it's |
| // going to be used. Assuming we don't make streaming search aware of |
| // these special types of prefilters as described above, we could fix |
| // this by building a "backup" prefilter that could be used when the |
| // rare byte prefilter could not. But that's a bandaide. Sigh. |
| true |
| } |
| } |
| |
| /// A prefilter for scanning for two "rare" bytes. |
| #[derive(Clone, Debug)] |
| struct RareBytesTwo { |
| offsets: RareByteOffsets, |
| byte1: u8, |
| byte2: u8, |
| } |
| |
| impl Prefilter for RareBytesTwo { |
| fn next_candidate( |
| &self, |
| state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| memchr2(self.byte1, self.byte2, &haystack[at..]) |
| .map(|i| { |
| let pos = at + i; |
| state.update_at(pos); |
| let offset = self.offsets.set[haystack[pos] as usize].max; |
| cmp::max(at, pos.saturating_sub(offset as usize)) |
| }) |
| .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
| } |
| |
| fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
| Box::new(self.clone()) |
| } |
| |
| fn heap_bytes(&self) -> usize { |
| 0 |
| } |
| |
| fn looks_for_non_start_of_match(&self) -> bool { |
| // TODO: See Prefilter impl for RareBytesOne. |
| true |
| } |
| } |
| |
| /// A prefilter for scanning for three "rare" bytes. |
| #[derive(Clone, Debug)] |
| struct RareBytesThree { |
| offsets: RareByteOffsets, |
| byte1: u8, |
| byte2: u8, |
| byte3: u8, |
| } |
| |
| impl Prefilter for RareBytesThree { |
| fn next_candidate( |
| &self, |
| state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) |
| .map(|i| { |
| let pos = at + i; |
| state.update_at(pos); |
| let offset = self.offsets.set[haystack[pos] as usize].max; |
| cmp::max(at, pos.saturating_sub(offset as usize)) |
| }) |
| .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
| } |
| |
| fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
| Box::new(self.clone()) |
| } |
| |
| fn heap_bytes(&self) -> usize { |
| 0 |
| } |
| |
| fn looks_for_non_start_of_match(&self) -> bool { |
| // TODO: See Prefilter impl for RareBytesOne. |
| true |
| } |
| } |
| |
| /// A builder for constructing a starting byte prefilter. |
| /// |
| /// A starting byte prefilter is a simplistic prefilter that looks for possible |
| /// matches by reporting all positions corresponding to a particular byte. This |
| /// generally only takes affect when there are at most 3 distinct possible |
| /// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two |
| /// distinct starting bytes (`f` and `b`), and this prefilter returns all |
| /// occurrences of either `f` or `b`. |
| /// |
| /// In some cases, a heuristic frequency analysis may determine that it would |
| /// be better not to use this prefilter even when there are 3 or fewer distinct |
| /// starting bytes. |
| #[derive(Clone, Debug)] |
| struct StartBytesBuilder { |
| /// Whether this prefilter should account for ASCII case insensitivity or |
| /// not. |
| ascii_case_insensitive: bool, |
| /// The set of starting bytes observed. |
| byteset: Vec<bool>, |
| /// The number of bytes set to true in `byteset`. |
| count: usize, |
| /// The sum of frequency ranks for the rare bytes detected. This is |
| /// intended to give a heuristic notion of how rare the bytes are. |
| rank_sum: u16, |
| } |
| |
| impl StartBytesBuilder { |
| /// Create a new builder for constructing a start byte prefilter. |
| fn new() -> StartBytesBuilder { |
| StartBytesBuilder { |
| ascii_case_insensitive: false, |
| byteset: vec![false; 256], |
| count: 0, |
| rank_sum: 0, |
| } |
| } |
| |
| /// Enable ASCII case insensitivity. When set, byte strings added to this |
| /// builder will be interpreted without respect to ASCII case. |
| fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { |
| self.ascii_case_insensitive = yes; |
| self |
| } |
| |
| /// Build the starting bytes prefilter. |
| /// |
| /// If there are more than 3 distinct starting bytes, or if heuristics |
| /// otherwise determine that this prefilter should not be used, then `None` |
| /// is returned. |
| fn build(&self) -> Option<PrefilterObj> { |
| if self.count > 3 { |
| return None; |
| } |
| let (mut bytes, mut len) = ([0; 3], 0); |
| for b in 0..256 { |
| if !self.byteset[b] { |
| continue; |
| } |
| // We don't handle non-ASCII bytes for now. Getting non-ASCII |
| // bytes right is trickier, since we generally don't want to put |
| // a leading UTF-8 code unit into a prefilter that isn't ASCII, |
| // since they can frequently. Instead, it would be better to use a |
| // continuation byte, but this requires more sophisticated analysis |
| // of the automaton and a richer prefilter API. |
| if b > 0x7F { |
| return None; |
| } |
| bytes[len] = b as u8; |
| len += 1; |
| } |
| match len { |
| 0 => None, |
| 1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })), |
| 2 => Some(PrefilterObj::new(StartBytesTwo { |
| byte1: bytes[0], |
| byte2: bytes[1], |
| })), |
| 3 => Some(PrefilterObj::new(StartBytesThree { |
| byte1: bytes[0], |
| byte2: bytes[1], |
| byte3: bytes[2], |
| })), |
| _ => unreachable!(), |
| } |
| } |
| |
| /// Add a byte string to this builder. |
| /// |
| /// All patterns added to an Aho-Corasick automaton should be added to this |
| /// builder before attempting to construct the prefilter. |
| fn add(&mut self, bytes: &[u8]) { |
| if self.count > 3 { |
| return; |
| } |
| if let Some(&byte) = bytes.get(0) { |
| self.add_one_byte(byte); |
| if self.ascii_case_insensitive { |
| self.add_one_byte(opposite_ascii_case(byte)); |
| } |
| } |
| } |
| |
| fn add_one_byte(&mut self, byte: u8) { |
| if !self.byteset[byte as usize] { |
| self.byteset[byte as usize] = true; |
| self.count += 1; |
| self.rank_sum += freq_rank(byte) as u16; |
| } |
| } |
| } |
| |
| /// A prefilter for scanning for a single starting byte. |
| #[derive(Clone, Debug)] |
| struct StartBytesOne { |
| byte1: u8, |
| } |
| |
| impl Prefilter for StartBytesOne { |
| fn next_candidate( |
| &self, |
| _state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| memchr(self.byte1, &haystack[at..]) |
| .map(|i| at + i) |
| .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
| } |
| |
| fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
| Box::new(self.clone()) |
| } |
| |
| fn heap_bytes(&self) -> usize { |
| 0 |
| } |
| } |
| |
| /// A prefilter for scanning for two starting bytes. |
| #[derive(Clone, Debug)] |
| struct StartBytesTwo { |
| byte1: u8, |
| byte2: u8, |
| } |
| |
| impl Prefilter for StartBytesTwo { |
| fn next_candidate( |
| &self, |
| _state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| memchr2(self.byte1, self.byte2, &haystack[at..]) |
| .map(|i| at + i) |
| .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
| } |
| |
| fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
| Box::new(self.clone()) |
| } |
| |
| fn heap_bytes(&self) -> usize { |
| 0 |
| } |
| } |
| |
| /// A prefilter for scanning for three starting bytes. |
| #[derive(Clone, Debug)] |
| struct StartBytesThree { |
| byte1: u8, |
| byte2: u8, |
| byte3: u8, |
| } |
| |
| impl Prefilter for StartBytesThree { |
| fn next_candidate( |
| &self, |
| _state: &mut PrefilterState, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) |
| .map(|i| at + i) |
| .map_or(Candidate::None, Candidate::PossibleStartOfMatch) |
| } |
| |
| fn clone_prefilter(&self) -> Box<dyn Prefilter> { |
| Box::new(self.clone()) |
| } |
| |
| fn heap_bytes(&self) -> usize { |
| 0 |
| } |
| } |
| |
| /// Return the next candidate reported by the given prefilter while |
| /// simultaneously updating the given prestate. |
| /// |
| /// The caller is responsible for checking the prestate before deciding whether |
| /// to initiate a search. |
| #[inline] |
| pub fn next<P: Prefilter>( |
| prestate: &mut PrefilterState, |
| prefilter: P, |
| haystack: &[u8], |
| at: usize, |
| ) -> Candidate { |
| let cand = prefilter.next_candidate(prestate, haystack, at); |
| match cand { |
| Candidate::None => { |
| prestate.update_skipped_bytes(haystack.len() - at); |
| } |
| Candidate::Match(ref m) => { |
| prestate.update_skipped_bytes(m.start() - at); |
| } |
| Candidate::PossibleStartOfMatch(i) => { |
| prestate.update_skipped_bytes(i - at); |
| } |
| } |
| cand |
| } |
| |
| /// If the given byte is an ASCII letter, then return it in the opposite case. |
| /// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns |
| /// `b'A'`. If a non-ASCII letter is given, then the given byte is returned. |
| pub fn opposite_ascii_case(b: u8) -> u8 { |
| if b'A' <= b && b <= b'Z' { |
| b.to_ascii_lowercase() |
| } else if b'a' <= b && b <= b'z' { |
| b.to_ascii_uppercase() |
| } else { |
| b |
| } |
| } |
| |
| /// Return the frequency rank of the given byte. The higher the rank, the more |
| /// common the byte (heuristically speaking). |
| fn freq_rank(b: u8) -> u8 { |
| use crate::byte_frequencies::BYTE_FREQUENCIES; |
| BYTE_FREQUENCIES[b as usize] |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| |
| #[test] |
| fn scratch() { |
| let mut b = Builder::new(MatchKind::LeftmostFirst); |
| b.add(b"Sherlock"); |
| b.add(b"locjaw"); |
| // b.add(b"Sherlock"); |
| // b.add(b"Holmes"); |
| // b.add(b"Watson"); |
| // b.add("Шерлок Холмс".as_bytes()); |
| // b.add("Джон Уотсон".as_bytes()); |
| |
| let s = b.build().unwrap(); |
| println!("{:?}", s); |
| } |
| } |