| //! Boyer-Moore-Horspool |
| //! |
| |
| extern crate memchr; |
| use std::cmp; |
| |
| fn bmh_skip(pat: &[u8], skip: &mut [u16; 256]) { |
| let pat_skip = cmp::min(pat.len(), u16::max_value() as usize) as u16; |
| for entry in skip.iter_mut() { |
| *entry = pat_skip; |
| } |
| |
| for (index, &byte) in pat[..pat.len() - 1].iter().enumerate() { |
| skip[byte as usize] = cmp::min(pat.len() - index - 1, u16::max_value() as usize) as u16; |
| } |
| } |
| |
| /// Boyer-Moore-Horspool substring search |
| pub fn find(text: &[u8], pat: &[u8]) -> Option<usize> { |
| let mut skip = [0; 256]; |
| bmh_skip(pat, &mut skip); |
| |
| let pat_len = pat.len(); |
| |
| if pat_len == 0 { |
| return Some(0); |
| } |
| |
| let pat_len_m1 = pat_len - 1; |
| let pat_last = pat[pat_len - 1]; |
| |
| // initial search by memchr |
| let mut j = match memchr::memchr(pat[0], text) { |
| Some(x) => x, |
| None => return None, |
| }; |
| |
| while let Some(&c) = text.get(j + pat_len_m1) { |
| // check the back character of the pattern |
| if c == pat_last && &text[j..j + pat_len] == pat { |
| return Some(j); |
| } |
| j += skip[c as usize] as usize; |
| } |
| None |
| } |
| |
| #[test] |
| fn bmh_preprocess() { |
| let mut skip = [0; 256]; |
| let needle = b"gcagagag"; |
| bmh_skip(needle, &mut skip); |
| assert_eq!(skip[b'g' as usize], 2); |
| assert_eq!(skip[b'c' as usize], 6); |
| assert_eq!(skip[b'a' as usize], 1); |
| assert_eq!(skip[b't' as usize], 8); |
| } |
| |
| #[test] |
| fn bmh_find() { |
| let text = b"abc"; |
| assert_eq!(find(text, b"d"), None); |
| assert_eq!(find(text, b"c"), Some(2)); |
| |
| let longer = "longer text and so on"; |
| |
| // test all windows |
| for wsz in 1..17 { |
| for window in longer.as_bytes().windows(wsz) { |
| let str_find = longer.find(::std::str::from_utf8(window).unwrap()); |
| assert!(str_find.is_some()); |
| assert_eq!(find(longer.as_bytes(), window), str_find); |
| } |
| } |
| |
| let pat = b"ger text and so on"; |
| assert!(pat.len() > 16); |
| assert_eq!(Some(3), find(longer.as_bytes(), pat)); |
| } |