blob: b620cc284a7d9d7c999d012e3be0a82d96e9efd7 [file] [log] [blame]
// This example demonstrates how to use the Aho-Corasick algorithm to rapidly
// scan text for matches in a large dictionary of keywords. This example by
// default reads your system's dictionary (~120,000 words).
extern crate aho_corasick;
extern crate csv;
extern crate docopt;
extern crate memmap;
extern crate serde;
#[macro_use]
extern crate serde_derive;
use std::error::Error;
use std::fs::File;
use std::io::{self, BufRead, Write};
use std::process;
use aho_corasick::{Automaton, AcAutomaton, Match};
use docopt::Docopt;
use memmap::Mmap;
// Command line help text, also parsed by docopt to derive the accepted
// arguments and their defaults.
//
// The layout is significant to docopt: the blank line terminates the `Usage:`
// section, and every option must be separated from its description by at
// least two spaces.
static USAGE: &'static str = "
Usage: dict-search [options] <input>
       dict-search --help

Options:
    -d <path>, --dict <path>   Path to dictionary of keywords to search.
                               [default: /usr/share/dict/words]
    -m <len>, --min-len <len>  The minimum length for a keyword in UTF-8
                               encoded bytes. [default: 5]
    --overlapping              Report overlapping matches.
    -c, --count                Show only the number of matches.
    --memory-usage             Show memory usage of automaton.
    --full                     Use fully expanded transition matrix.
                               Warning: may use lots of memory.
    -h, --help                 Show this usage message.
";
// Parsed command line arguments. Docopt fills each field from the
// correspondingly named positional argument (`arg_*`) or option (`flag_*`)
// described in `USAGE`. Fields are listed in the same order as in `USAGE`.
#[derive(Clone, Debug, Deserialize)]
struct Args {
    // `<input>`: path of the file to search.
    arg_input: String,
    // `--dict`: path of the keyword dictionary, one keyword per line.
    flag_dict: String,
    // `--min-len`: keywords shorter than this many bytes are skipped.
    flag_min_len: usize,
    // `--overlapping`: report overlapping matches.
    flag_overlapping: bool,
    // `--count`: print only the number of matches.
    flag_count: bool,
    // `--memory-usage`: print automaton size statistics and exit.
    flag_memory_usage: bool,
    // `--full`: convert the automaton to its fully expanded form first.
    flag_full: bool,
}
// Entry point: parses the command line according to `USAGE`, runs the search
// and, on failure, prints the error to stderr and exits with status 1.
fn main() {
    // On a parse error (or `--help`), docopt prints a message and exits.
    let args: Args = Docopt::new(USAGE)
        .and_then(|dopt| dopt.deserialize())
        .unwrap_or_else(|parse_err| parse_err.exit());

    if let Err(err) = run(&args) {
        writeln!(&mut io::stderr(), "{}", err).unwrap();
        process::exit(1);
    }
}
fn run(args: &Args) -> Result<(), Box<Error>> {
let aut = try!(build_automaton(&args.flag_dict, args.flag_min_len));
if args.flag_memory_usage {
let (bytes, states) = if args.flag_full {
let aut = aut.into_full();
(aut.heap_bytes(), aut.num_states())
} else {
(aut.heap_bytes(), aut.num_states())
};
println!("{} bytes, {} states", bytes, states);
return Ok(());
}
let rdr = try!(File::open(&args.arg_input));
if args.flag_full {
let aut = aut.into_full();
if args.flag_overlapping {
if args.flag_count {
let mmap = unsafe { try!(Mmap::map(&rdr)) };
println!("{}", aut.find_overlapping(&*mmap).count());
} else {
try!(write_matches(&aut, aut.stream_find_overlapping(rdr)));
}
} else {
if args.flag_count {
let mmap = unsafe { try!(Mmap::map(&rdr)) };
println!("{}", aut.find(&*mmap).count());
} else {
try!(write_matches(&aut, aut.stream_find(rdr)));
}
}
} else {
if args.flag_overlapping {
if args.flag_count {
let mmap = unsafe { try!(Mmap::map(&rdr)) };
println!("{}", aut.find_overlapping(&*mmap).count());
} else {
try!(write_matches(&aut, aut.stream_find_overlapping(rdr)));
}
} else {
if args.flag_count {
let mmap = unsafe { try!(Mmap::map(&rdr)) };
println!("{}", aut.find(&*mmap).count());
} else {
try!(write_matches(&aut, aut.stream_find(rdr)));
}
}
}
Ok(())
}
// Writes every match produced by `it` to stdout as CSV.
//
// The output starts with a `pattern,start,end` header row. Each following row
// holds the matched pattern (looked up in `aut` by the match's pattern index)
// and the match's `start` and `end` fields.
//
// Returns the first error yielded by the iterator or by the CSV writer.
fn write_matches<A, I>(aut: &A, it: I) -> Result<(), Box<Error>>
where
    A: Automaton<String>,
    I: Iterator<Item = io::Result<Match>>,
{
    let mut out = csv::Writer::from_writer(io::stdout());
    out.serialize(("pattern", "start", "end"))?;
    for item in it {
        let found = item?;
        let row = (aut.pattern(found.pati), found.start, found.end);
        out.serialize(row)?;
    }
    // Flush explicitly so that write errors are reported rather than dropped.
    out.flush()?;
    Ok(())
}
fn build_automaton(
dict_path: &str,
min_len: usize,
) -> Result<AcAutomaton<String>, Box<Error>> {
let buf = io::BufReader::new(try!(File::open(dict_path)));
let mut lines = Vec::with_capacity(1 << 10);
for line in buf.lines() {
let line = try!(line);
if line.len() >= min_len {
lines.push(line);
}
}
Ok(AcAutomaton::with_transitions(lines))
}