blob: ef814117cc6db0d23ef47c0db60815c8006fa42e [file] [log] [blame]
use std::cmp;
use std::fmt;
use std::panic::{RefUnwindSafe, UnwindSafe};
use std::u8;
use memchr::{memchr, memchr2, memchr3};
use crate::ahocorasick::MatchKind;
use crate::packed;
use crate::Match;
/// A candidate is the result of running a prefilter on a haystack at a
/// particular position. The result is either no match, a confirmed match or
/// a possible match.
///
/// When no match is returned, the prefilter is guaranteeing that no possible
/// match can be found in the haystack, and the caller may trust this. That is,
/// all correct prefilters must never report false negatives.
///
/// In some cases, a prefilter can confirm a match very quickly, in which case,
/// the caller may use this to stop what it's doing and report the match. In
/// this case, prefilter implementations must never report a false positive.
/// In other cases, the prefilter can only report a potential match, in which
/// case the callers must attempt to confirm the match. In this case, prefilter
/// implementations are permitted to return false positives.
#[derive(Clone, Debug)]
pub enum Candidate {
None,
Match(Match),
PossibleStartOfMatch(usize),
}
impl Candidate {
/// Convert this candidate into an option. This is useful when callers
/// do not distinguish between true positives and false positives (i.e.,
/// the caller must always confirm the match in order to update some other
/// state).
pub fn into_option(self) -> Option<usize> {
match self {
Candidate::None => None,
Candidate::Match(ref m) => Some(m.start()),
Candidate::PossibleStartOfMatch(start) => Some(start),
}
}
}
/// A prefilter describes the behavior of fast literal scanners for quickly
/// skipping past bytes in the haystack that we know cannot possibly
/// participate in a match.
pub trait Prefilter:
Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
{
/// Returns the next possible match candidate. This may yield false
/// positives, so callers must confirm a match starting at the position
/// returned. This, however, must never produce false negatives. That is,
/// this must, at minimum, return the starting position of the next match
/// in the given haystack after or at the given position.
fn next_candidate(
&self,
state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate;
/// A method for cloning a prefilter, to work-around the fact that Clone
/// is not object-safe.
fn clone_prefilter(&self) -> Box<dyn Prefilter>;
/// Returns the approximate total amount of heap used by this prefilter, in
/// units of bytes.
fn heap_bytes(&self) -> usize;
/// Returns true if and only if this prefilter never returns false
/// positives. This is useful for completely avoiding the automaton
/// when the prefilter can quickly confirm its own matches.
///
/// By default, this returns true, which is conservative; it is always
/// correct to return `true`. Returning `false` here and reporting a false
/// positive will result in incorrect searches.
fn reports_false_positives(&self) -> bool {
true
}
/// Returns true if and only if this prefilter may look for a non-starting
/// position of a match.
///
/// This is useful in a streaming context where prefilters that don't look
/// for a starting position of a match can be quite difficult to deal with.
///
/// This returns false by default.
fn looks_for_non_start_of_match(&self) -> bool {
false
}
}
impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
#[inline]
fn next_candidate(
&self,
state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate {
(**self).next_candidate(state, haystack, at)
}
fn clone_prefilter(&self) -> Box<dyn Prefilter> {
(**self).clone_prefilter()
}
fn heap_bytes(&self) -> usize {
(**self).heap_bytes()
}
fn reports_false_positives(&self) -> bool {
(**self).reports_false_positives()
}
}
/// A convenience object for representing any type that implements Prefilter
/// and is cloneable.
#[derive(Debug)]
pub struct PrefilterObj(Box<dyn Prefilter>);
impl Clone for PrefilterObj {
fn clone(&self) -> Self {
PrefilterObj(self.0.clone_prefilter())
}
}
impl PrefilterObj {
/// Create a new prefilter object.
pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
PrefilterObj(Box::new(t))
}
/// Return the underlying prefilter trait object.
pub fn as_ref(&self) -> &dyn Prefilter {
&*self.0
}
}
/// PrefilterState tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator via, e.g., `find_iter`, is treated as a single search.)
#[derive(Clone, Debug)]
pub struct PrefilterState {
/// The number of skips that has been executed.
skips: usize,
/// The total number of bytes that have been skipped.
skipped: usize,
/// The maximum length of a match. This is used to help determine how many
/// bytes on average should be skipped in order for a prefilter to be
/// effective.
max_match_len: usize,
/// Once this heuristic has been deemed permanently ineffective, it will be
/// inert throughout the rest of its lifetime. This serves as a cheap way
/// to check inertness.
inert: bool,
/// The last (absolute) position at which a prefilter scanned to.
/// Prefilters can use this position to determine whether to re-scan or
/// not.
///
/// Unlike other things that impact effectiveness, this is a fleeting
/// condition. That is, a prefilter can be considered ineffective if it is
/// at a position before `last_scan_at`, but can become effective again
/// once the search moves past `last_scan_at`.
///
/// The utility of this is to both avoid additional overhead from calling
/// the prefilter and to avoid quadratic behavior. This ensures that a
/// prefilter will scan any particular byte at most once. (Note that some
/// prefilters, like the start-byte prefilter, do not need to use this
/// field at all, since it only looks for starting bytes.)
last_scan_at: usize,
}
impl PrefilterState {
/// The minimum number of skip attempts to try before considering whether
/// a prefilter is effective or not.
const MIN_SKIPS: usize = 40;
/// The minimum amount of bytes that skipping must average, expressed as a
/// factor of the multiple of the length of a possible match.
///
/// That is, after MIN_SKIPS have occurred, if the average number of bytes
/// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the
/// prefilter outed to be rendered inert.
const MIN_AVG_FACTOR: usize = 2;
/// Create a fresh prefilter state.
pub fn new(max_match_len: usize) -> PrefilterState {
PrefilterState {
skips: 0,
skipped: 0,
max_match_len,
inert: false,
last_scan_at: 0,
}
}
/// Create a prefilter state that always disables the prefilter.
pub fn disabled() -> PrefilterState {
PrefilterState {
skips: 0,
skipped: 0,
max_match_len: 0,
inert: true,
last_scan_at: 0,
}
}
/// Update this state with the number of bytes skipped on the last
/// invocation of the prefilter.
#[inline]
fn update_skipped_bytes(&mut self, skipped: usize) {
self.skips += 1;
self.skipped += skipped;
}
/// Updates the position at which the last scan stopped. This may be
/// greater than the position of the last candidate reported. For example,
/// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz`
/// will report a candidate at position `0`, but the end of its last scan
/// will be at position `3`.
///
/// This position factors into the effectiveness of this prefilter. If the
/// current position is less than the last position at which a scan ended,
/// then the prefilter should not be re-run until the search moves past
/// that position.
#[inline]
fn update_at(&mut self, at: usize) {
if at > self.last_scan_at {
self.last_scan_at = at;
}
}
/// Return true if and only if this state indicates that a prefilter is
/// still effective.
///
/// The given pos should correspond to the current starting position of the
/// search.
#[inline]
pub fn is_effective(&mut self, at: usize) -> bool {
if self.inert {
return false;
}
if at < self.last_scan_at {
return false;
}
if self.skips < PrefilterState::MIN_SKIPS {
return true;
}
let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
if self.skipped >= min_avg * self.skips {
return true;
}
// We're inert.
self.inert = true;
false
}
}
/// A builder for constructing the best possible prefilter. When constructed,
/// this builder will heuristically select the best prefilter it can build,
/// if any, and discard the rest.
#[derive(Debug)]
pub struct Builder {
count: usize,
ascii_case_insensitive: bool,
start_bytes: StartBytesBuilder,
rare_bytes: RareBytesBuilder,
packed: Option<packed::Builder>,
}
impl Builder {
/// Create a new builder for constructing the best possible prefilter.
pub fn new(kind: MatchKind) -> Builder {
let pbuilder = kind
.as_packed()
.map(|kind| packed::Config::new().match_kind(kind).builder());
Builder {
count: 0,
ascii_case_insensitive: false,
start_bytes: StartBytesBuilder::new(),
rare_bytes: RareBytesBuilder::new(),
packed: pbuilder,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
self.ascii_case_insensitive = yes;
self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
self
}
/// Return a prefilter suitable for quickly finding potential matches.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
pub fn build(&self) -> Option<PrefilterObj> {
// match (self.start_bytes.build(), self.rare_bytes.build()) {
match (self.start_bytes.build(), self.rare_bytes.build()) {
// If we could build both start and rare prefilters, then there are
// a few cases in which we'd want to use the start-byte prefilter
// over the rare-byte prefilter, since the former has lower
// overhead.
(prestart @ Some(_), prerare @ Some(_)) => {
// If the start-byte prefilter can scan for a smaller number
// of bytes than the rare-byte prefilter, then it's probably
// faster.
let has_fewer_bytes =
self.start_bytes.count < self.rare_bytes.count;
// Otherwise, if the combined frequency rank of the detected
// bytes in the start-byte prefilter is "close" to the combined
// frequency rank of the rare-byte prefilter, then we pick
// the start-byte prefilter even if the rare-byte prefilter
// heuristically searches for rare bytes. This is because the
// rare-byte prefilter has higher constant costs, so we tend to
// prefer the start-byte prefilter when we can.
let has_rarer_bytes =
self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
if has_fewer_bytes || has_rarer_bytes {
prestart
} else {
prerare
}
}
(prestart @ Some(_), None) => prestart,
(None, prerare @ Some(_)) => prerare,
(None, None) if self.ascii_case_insensitive => None,
(None, None) => self
.packed
.as_ref()
.and_then(|b| b.build())
.map(|s| PrefilterObj::new(Packed(s))),
}
}
/// Add a literal string to this prefilter builder.
pub fn add(&mut self, bytes: &[u8]) {
self.count += 1;
self.start_bytes.add(bytes);
self.rare_bytes.add(bytes);
if let Some(ref mut pbuilder) = self.packed {
pbuilder.add(bytes);
}
}
}
/// A type that wraps a packed searcher and implements the `Prefilter`
/// interface.
#[derive(Clone, Debug)]
struct Packed(packed::Searcher);
impl Prefilter for Packed {
fn next_candidate(
&self,
_state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate {
self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match)
}
fn clone_prefilter(&self) -> Box<dyn Prefilter> {
Box::new(self.clone())
}
fn heap_bytes(&self) -> usize {
self.0.heap_bytes()
}
fn reports_false_positives(&self) -> bool {
false
}
}
/// A builder for constructing a rare byte prefilter.
///
/// A rare byte prefilter attempts to pick out a small set of rare bytes that
/// occurr in the patterns, and then quickly scan to matches of those rare
/// bytes.
#[derive(Clone, Debug)]
struct RareBytesBuilder {
/// Whether this prefilter should account for ASCII case insensitivity or
/// not.
ascii_case_insensitive: bool,
/// A set of rare bytes, indexed by byte value.
rare_set: ByteSet,
/// A set of byte offsets associated with bytes in a pattern. An entry
/// corresponds to a particular bytes (its index) and is only non-zero if
/// the byte occurred at an offset greater than 0 in at least one pattern.
///
/// If a byte's offset is not representable in 8 bits, then the rare bytes
/// prefilter becomes inert.
byte_offsets: RareByteOffsets,
/// Whether this is available as a prefilter or not. This can be set to
/// false during construction if a condition is seen that invalidates the
/// use of the rare-byte prefilter.
available: bool,
/// The number of bytes set to an active value in `byte_offsets`.
count: usize,
/// The sum of frequency ranks for the rare bytes detected. This is
/// intended to give a heuristic notion of how rare the bytes are.
rank_sum: u16,
}
/// A set of bytes.
#[derive(Clone, Copy)]
struct ByteSet([bool; 256]);
impl ByteSet {
fn empty() -> ByteSet {
ByteSet([false; 256])
}
fn insert(&mut self, b: u8) -> bool {
let new = !self.contains(b);
self.0[b as usize] = true;
new
}
fn contains(&self, b: u8) -> bool {
self.0[b as usize]
}
}
impl fmt::Debug for ByteSet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut bytes = vec![];
for b in 0..=255 {
if self.contains(b) {
bytes.push(b);
}
}
f.debug_struct("ByteSet").field("set", &bytes).finish()
}
}
/// A set of byte offsets, keyed by byte.
#[derive(Clone, Copy)]
struct RareByteOffsets {
/// Each entry corresponds to the maximum offset of the corresponding
/// byte across all patterns seen.
set: [RareByteOffset; 256],
}
impl RareByteOffsets {
/// Create a new empty set of rare byte offsets.
pub fn empty() -> RareByteOffsets {
RareByteOffsets { set: [RareByteOffset::default(); 256] }
}
/// Add the given offset for the given byte to this set. If the offset is
/// greater than the existing offset, then it overwrites the previous
/// value and returns false. If there is no previous value set, then this
/// sets it and returns true.
pub fn set(&mut self, byte: u8, off: RareByteOffset) {
self.set[byte as usize].max =
cmp::max(self.set[byte as usize].max, off.max);
}
}
impl fmt::Debug for RareByteOffsets {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut offsets = vec![];
for off in self.set.iter() {
if off.max > 0 {
offsets.push(off);
}
}
f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
}
}
/// Offsets associated with an occurrence of a "rare" byte in any of the
/// patterns used to construct a single Aho-Corasick automaton.
#[derive(Clone, Copy, Debug)]
struct RareByteOffset {
/// The maximum offset at which a particular byte occurs from the start
/// of any pattern. This is used as a shift amount. That is, when an
/// occurrence of this byte is found, the candidate position reported by
/// the prefilter is `position_of_byte - max`, such that the automaton
/// will begin its search at a position that is guaranteed to observe a
/// match.
///
/// To avoid accidentally quadratic behavior, a prefilter is considered
/// ineffective when it is asked to start scanning from a position that it
/// has already scanned past.
///
/// Using a `u8` here means that if we ever see a pattern that's longer
/// than 255 bytes, then the entire rare byte prefilter is disabled.
max: u8,
}
impl Default for RareByteOffset {
fn default() -> RareByteOffset {
RareByteOffset { max: 0 }
}
}
impl RareByteOffset {
/// Create a new rare byte offset. If the given offset is too big, then
/// None is returned. In that case, callers should render the rare bytes
/// prefilter inert.
fn new(max: usize) -> Option<RareByteOffset> {
if max > u8::MAX as usize {
None
} else {
Some(RareByteOffset { max: max as u8 })
}
}
}
impl RareBytesBuilder {
/// Create a new builder for constructing a rare byte prefilter.
fn new() -> RareBytesBuilder {
RareBytesBuilder {
ascii_case_insensitive: false,
rare_set: ByteSet::empty(),
byte_offsets: RareByteOffsets::empty(),
available: true,
count: 0,
rank_sum: 0,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
self.ascii_case_insensitive = yes;
self
}
/// Build the rare bytes prefilter.
///
/// If there are more than 3 distinct starting bytes, or if heuristics
/// otherwise determine that this prefilter should not be used, then `None`
/// is returned.
fn build(&self) -> Option<PrefilterObj> {
if !self.available || self.count > 3 {
return None;
}
let (mut bytes, mut len) = ([0; 3], 0);
for b in 0..=255 {
if self.rare_set.contains(b) {
bytes[len] = b as u8;
len += 1;
}
}
match len {
0 => None,
1 => Some(PrefilterObj::new(RareBytesOne {
byte1: bytes[0],
offset: self.byte_offsets.set[bytes[0] as usize],
})),
2 => Some(PrefilterObj::new(RareBytesTwo {
offsets: self.byte_offsets,
byte1: bytes[0],
byte2: bytes[1],
})),
3 => Some(PrefilterObj::new(RareBytesThree {
offsets: self.byte_offsets,
byte1: bytes[0],
byte2: bytes[1],
byte3: bytes[2],
})),
_ => unreachable!(),
}
}
/// Add a byte string to this builder.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
fn add(&mut self, bytes: &[u8]) {
// If we've already given up, then do nothing.
if !self.available {
return;
}
// If we've already blown our budget, then don't waste time looking
// for more rare bytes.
if self.count > 3 {
self.available = false;
return;
}
// If the pattern is too long, then our offset table is bunk, so
// give up.
if bytes.len() >= 256 {
self.available = false;
return;
}
let mut rarest = match bytes.get(0) {
None => return,
Some(&b) => (b, freq_rank(b)),
};
// The idea here is to look for the rarest byte in each pattern, and
// add that to our set. As a special exception, if we see a byte that
// we've already added, then we immediately stop and choose that byte,
// even if there's another rare byte in the pattern. This helps us
// apply the rare byte optimization in more cases by attempting to pick
// bytes that are in common between patterns. So for example, if we
// were searching for `Sherlock` and `lockjaw`, then this would pick
// `k` for both patterns, resulting in the use of `memchr` instead of
// `memchr2` for `k` and `j`.
let mut found = false;
for (pos, &b) in bytes.iter().enumerate() {
self.set_offset(pos, b);
if found {
continue;
}
if self.rare_set.contains(b) {
found = true;
continue;
}
let rank = freq_rank(b);
if rank < rarest.1 {
rarest = (b, rank);
}
}
if !found {
self.add_rare_byte(rarest.0);
}
}
fn set_offset(&mut self, pos: usize, byte: u8) {
// This unwrap is OK because pos is never bigger than our max.
let offset = RareByteOffset::new(pos).unwrap();
self.byte_offsets.set(byte, offset);
if self.ascii_case_insensitive {
self.byte_offsets.set(opposite_ascii_case(byte), offset);
}
}
fn add_rare_byte(&mut self, byte: u8) {
self.add_one_rare_byte(byte);
if self.ascii_case_insensitive {
self.add_one_rare_byte(opposite_ascii_case(byte));
}
}
fn add_one_rare_byte(&mut self, byte: u8) {
if self.rare_set.insert(byte) {
self.count += 1;
self.rank_sum += freq_rank(byte) as u16;
}
}
}
/// A prefilter for scanning for a single "rare" byte.
#[derive(Clone, Debug)]
struct RareBytesOne {
byte1: u8,
offset: RareByteOffset,
}
impl Prefilter for RareBytesOne {
fn next_candidate(
&self,
state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate {
memchr(self.byte1, &haystack[at..])
.map(|i| {
let pos = at + i;
state.last_scan_at = pos;
cmp::max(at, pos.saturating_sub(self.offset.max as usize))
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
fn clone_prefilter(&self) -> Box<dyn Prefilter> {
Box::new(self.clone())
}
fn heap_bytes(&self) -> usize {
0
}
fn looks_for_non_start_of_match(&self) -> bool {
// TODO: It should be possible to use a rare byte prefilter in a
// streaming context. The main problem is that we usually assume that
// if a prefilter has scanned some text and not found anything, then no
// match *starts* in that text. This doesn't matter in non-streaming
// contexts, but in a streaming context, if we're looking for a byte
// that doesn't start at the beginning of a match and don't find it,
// then it's still possible for a match to start at the end of the
// current buffer content. In order to fix this, the streaming searcher
// would need to become aware of prefilters that do this and use the
// appropriate offset in various places. It is quite a delicate change
// and probably shouldn't be attempted until streaming search has a
// better testing strategy. In particular, we'd really like to be able
// to vary the buffer size to force strange cases that occur at the
// edge of the buffer. If we make the buffer size minimal, then these
// cases occur more frequently and easier.
//
// This is also a bummer because this means that if the prefilter
// builder chose a rare byte prefilter, then a streaming search won't
// use any prefilter at all because the builder doesn't know how it's
// going to be used. Assuming we don't make streaming search aware of
// these special types of prefilters as described above, we could fix
// this by building a "backup" prefilter that could be used when the
// rare byte prefilter could not. But that's a bandaide. Sigh.
true
}
}
/// A prefilter for scanning for two "rare" bytes.
#[derive(Clone, Debug)]
struct RareBytesTwo {
offsets: RareByteOffsets,
byte1: u8,
byte2: u8,
}
impl Prefilter for RareBytesTwo {
fn next_candidate(
&self,
state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate {
memchr2(self.byte1, self.byte2, &haystack[at..])
.map(|i| {
let pos = at + i;
state.update_at(pos);
let offset = self.offsets.set[haystack[pos] as usize].max;
cmp::max(at, pos.saturating_sub(offset as usize))
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
fn clone_prefilter(&self) -> Box<dyn Prefilter> {
Box::new(self.clone())
}
fn heap_bytes(&self) -> usize {
0
}
fn looks_for_non_start_of_match(&self) -> bool {
// TODO: See Prefilter impl for RareBytesOne.
true
}
}
/// A prefilter for scanning for three "rare" bytes.
#[derive(Clone, Debug)]
struct RareBytesThree {
offsets: RareByteOffsets,
byte1: u8,
byte2: u8,
byte3: u8,
}
impl Prefilter for RareBytesThree {
fn next_candidate(
&self,
state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate {
memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
.map(|i| {
let pos = at + i;
state.update_at(pos);
let offset = self.offsets.set[haystack[pos] as usize].max;
cmp::max(at, pos.saturating_sub(offset as usize))
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
fn clone_prefilter(&self) -> Box<dyn Prefilter> {
Box::new(self.clone())
}
fn heap_bytes(&self) -> usize {
0
}
fn looks_for_non_start_of_match(&self) -> bool {
// TODO: See Prefilter impl for RareBytesOne.
true
}
}
/// A builder for constructing a starting byte prefilter.
///
/// A starting byte prefilter is a simplistic prefilter that looks for possible
/// matches by reporting all positions corresponding to a particular byte. This
/// generally only takes affect when there are at most 3 distinct possible
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
/// distinct starting bytes (`f` and `b`), and this prefilter returns all
/// occurrences of either `f` or `b`.
///
/// In some cases, a heuristic frequency analysis may determine that it would
/// be better not to use this prefilter even when there are 3 or fewer distinct
/// starting bytes.
#[derive(Clone, Debug)]
struct StartBytesBuilder {
/// Whether this prefilter should account for ASCII case insensitivity or
/// not.
ascii_case_insensitive: bool,
/// The set of starting bytes observed.
byteset: Vec<bool>,
/// The number of bytes set to true in `byteset`.
count: usize,
/// The sum of frequency ranks for the rare bytes detected. This is
/// intended to give a heuristic notion of how rare the bytes are.
rank_sum: u16,
}
impl StartBytesBuilder {
/// Create a new builder for constructing a start byte prefilter.
fn new() -> StartBytesBuilder {
StartBytesBuilder {
ascii_case_insensitive: false,
byteset: vec![false; 256],
count: 0,
rank_sum: 0,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
self.ascii_case_insensitive = yes;
self
}
/// Build the starting bytes prefilter.
///
/// If there are more than 3 distinct starting bytes, or if heuristics
/// otherwise determine that this prefilter should not be used, then `None`
/// is returned.
fn build(&self) -> Option<PrefilterObj> {
if self.count > 3 {
return None;
}
let (mut bytes, mut len) = ([0; 3], 0);
for b in 0..256 {
if !self.byteset[b] {
continue;
}
// We don't handle non-ASCII bytes for now. Getting non-ASCII
// bytes right is trickier, since we generally don't want to put
// a leading UTF-8 code unit into a prefilter that isn't ASCII,
// since they can frequently. Instead, it would be better to use a
// continuation byte, but this requires more sophisticated analysis
// of the automaton and a richer prefilter API.
if b > 0x7F {
return None;
}
bytes[len] = b as u8;
len += 1;
}
match len {
0 => None,
1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })),
2 => Some(PrefilterObj::new(StartBytesTwo {
byte1: bytes[0],
byte2: bytes[1],
})),
3 => Some(PrefilterObj::new(StartBytesThree {
byte1: bytes[0],
byte2: bytes[1],
byte3: bytes[2],
})),
_ => unreachable!(),
}
}
/// Add a byte string to this builder.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
fn add(&mut self, bytes: &[u8]) {
if self.count > 3 {
return;
}
if let Some(&byte) = bytes.get(0) {
self.add_one_byte(byte);
if self.ascii_case_insensitive {
self.add_one_byte(opposite_ascii_case(byte));
}
}
}
fn add_one_byte(&mut self, byte: u8) {
if !self.byteset[byte as usize] {
self.byteset[byte as usize] = true;
self.count += 1;
self.rank_sum += freq_rank(byte) as u16;
}
}
}
/// A prefilter for scanning for a single starting byte.
#[derive(Clone, Debug)]
struct StartBytesOne {
byte1: u8,
}
impl Prefilter for StartBytesOne {
fn next_candidate(
&self,
_state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate {
memchr(self.byte1, &haystack[at..])
.map(|i| at + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
fn clone_prefilter(&self) -> Box<dyn Prefilter> {
Box::new(self.clone())
}
fn heap_bytes(&self) -> usize {
0
}
}
/// A prefilter for scanning for two starting bytes.
#[derive(Clone, Debug)]
struct StartBytesTwo {
byte1: u8,
byte2: u8,
}
impl Prefilter for StartBytesTwo {
fn next_candidate(
&self,
_state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate {
memchr2(self.byte1, self.byte2, &haystack[at..])
.map(|i| at + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
fn clone_prefilter(&self) -> Box<dyn Prefilter> {
Box::new(self.clone())
}
fn heap_bytes(&self) -> usize {
0
}
}
/// A prefilter for scanning for three starting bytes.
#[derive(Clone, Debug)]
struct StartBytesThree {
byte1: u8,
byte2: u8,
byte3: u8,
}
impl Prefilter for StartBytesThree {
fn next_candidate(
&self,
_state: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Candidate {
memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
.map(|i| at + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
fn clone_prefilter(&self) -> Box<dyn Prefilter> {
Box::new(self.clone())
}
fn heap_bytes(&self) -> usize {
0
}
}
/// Return the next candidate reported by the given prefilter while
/// simultaneously updating the given prestate.
///
/// The caller is responsible for checking the prestate before deciding whether
/// to initiate a search.
#[inline]
pub fn next<P: Prefilter>(
prestate: &mut PrefilterState,
prefilter: P,
haystack: &[u8],
at: usize,
) -> Candidate {
let cand = prefilter.next_candidate(prestate, haystack, at);
match cand {
Candidate::None => {
prestate.update_skipped_bytes(haystack.len() - at);
}
Candidate::Match(ref m) => {
prestate.update_skipped_bytes(m.start() - at);
}
Candidate::PossibleStartOfMatch(i) => {
prestate.update_skipped_bytes(i - at);
}
}
cand
}
/// If the given byte is an ASCII letter, then return it in the opposite case.
/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
pub fn opposite_ascii_case(b: u8) -> u8 {
if b'A' <= b && b <= b'Z' {
b.to_ascii_lowercase()
} else if b'a' <= b && b <= b'z' {
b.to_ascii_uppercase()
} else {
b
}
}
/// Return the frequency rank of the given byte. The higher the rank, the more
/// common the byte (heuristically speaking).
fn freq_rank(b: u8) -> u8 {
use crate::byte_frequencies::BYTE_FREQUENCIES;
BYTE_FREQUENCIES[b as usize]
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn scratch() {
let mut b = Builder::new(MatchKind::LeftmostFirst);
b.add(b"Sherlock");
b.add(b"locjaw");
// b.add(b"Sherlock");
// b.add(b"Holmes");
// b.add(b"Watson");
// b.add("Шерлок Холмс".as_bytes());
// b.add("Джон Уотсон".as_bytes());
let s = b.build().unwrap();
println!("{:?}", s);
}
}