blob: 8c56843b3c314a18060c889a6481f249984d3cfa [file] [log] [blame]
//! This library implements string similarity metrics.
use std::char;
use std::cmp::{max, min};
use std::collections::HashMap;
#[derive(Debug, PartialEq)]
pub enum StrSimError {
DifferentLengthArgs
}
pub type HammingResult = Result<usize, StrSimError>;
/// Calculates the number of positions in the two strings where the characters
/// differ. Returns an error if the strings have different lengths.
///
/// ```
/// use strsim::hamming;
///
/// match hamming("hamming", "hammers") {
/// Ok(distance) => assert_eq!(3, distance),
/// Err(why) => panic!("{:?}", why)
/// }
/// ```
pub fn hamming(a: &str, b: &str) -> HammingResult {
let (mut ita, mut itb, mut count) = (a.chars(), b.chars(), 0);
loop {
match (ita.next(), itb.next()){
(Some(x), Some(y)) => if x != y { count += 1 },
(None, None) => return Ok(count),
_ => return Err(StrSimError::DifferentLengthArgs),
}
}
}
/// Calculates the Jaro similarity between two strings. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
///
/// ```
/// use strsim::jaro;
///
/// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
/// 0.001);
/// ```
pub fn jaro(a: &str, b: &str) -> f64 {
if a == b { return 1.0; }
let a_len = a.chars().count();
let b_len = b.chars().count();
// The check for lengths of one here is to prevent integer overflow when
// calculating the search range.
if a_len == 0 || b_len == 0 || (a_len == 1 && b_len == 1) {
return 0.0;
}
let search_range = (max(a_len, b_len) / 2) - 1;
let mut b_consumed = Vec::with_capacity(b_len);
for _ in 0..b_len {
b_consumed.push(false);
}
let mut matches = 0.0;
let mut transpositions = 0.0;
let mut b_match_index = 0;
for (i, a_char) in a.chars().enumerate() {
let min_bound =
// prevent integer wrapping
if i > search_range {
max(0, i - search_range)
} else {
0
};
let max_bound = min(b_len - 1, i + search_range);
if min_bound > max_bound {
continue;
}
for (j, b_char) in b.chars().enumerate() {
if min_bound <= j && j <= max_bound && a_char == b_char &&
!b_consumed[j] {
b_consumed[j] = true;
matches += 1.0;
if j < b_match_index {
transpositions += 1.0;
}
b_match_index = j;
break;
}
}
}
if matches == 0.0 {
0.0
} else {
(1.0 / 3.0) * ((matches / a_len as f64) +
(matches / b_len as f64) +
((matches - transpositions) / matches))
}
}
/// Like Jaro but gives a boost to strings that have a common prefix.
///
/// ```
/// use strsim::jaro_winkler;
///
/// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
/// 0.001);
/// ```
pub fn jaro_winkler(a: &str, b: &str) -> f64 {
let jaro_distance = jaro(a, b);
// Don't limit the length of the common prefix
let prefix_length = a.chars()
.zip(b.chars())
.take_while(|&(a_char, b_char)| a_char == b_char)
.count();
let jaro_winkler_distance =
jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance));
if jaro_winkler_distance <= 1.0 {
jaro_winkler_distance
} else {
1.0
}
}
/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one string into the other.
///
/// ```
/// use strsim::levenshtein;
///
/// assert_eq!(3, levenshtein("kitten", "sitting"));
/// ```
pub fn levenshtein(a: &str, b: &str) -> usize {
if a == b { return 0; }
let a_len = a.chars().count();
let b_len = b.chars().count();
if a_len == 0 { return b_len; }
if b_len == 0 { return a_len; }
let mut cache: Vec<usize> = (1..b_len+1).collect();
let mut result = 0;
let mut distance_a;
let mut distance_b;
for (i, a_char) in a.chars().enumerate() {
result = i;
distance_b = i;
for (j, b_char) in b.chars().enumerate() {
let cost = if a_char == b_char { 0 } else { 1 };
distance_a = distance_b + cost;
distance_b = cache[j];
result = min(result + 1, min(distance_a, distance_b + 1));
cache[j] = result;
}
}
result
}
/// Calculates a normalized score of the Levenshtein algorithm between 0.0 and
/// 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_levenshtein;
///
/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
/// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
if a.is_empty() && b.is_empty() {
return 1.0;
}
1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}
/// Like Levenshtein but allows for adjacent transpositions. Each substring can
/// only be edited once.
///
/// ```
/// use strsim::osa_distance;
///
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn osa_distance(a: &str, b: &str) -> usize {
let a_len = a.chars().count();
let b_len = b.chars().count();
if a == b { return 0; }
else if a_len == 0 { return b_len; }
else if b_len == 0 { return a_len; }
let mut prev_two_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
let mut prev_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
let mut curr_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
let mut prev_a_char = char::MAX;
let mut prev_b_char = char::MAX;
for i in 0..(b_len + 1) {
prev_two_distances.push(i);
prev_distances.push(i);
curr_distances.push(0);
}
for (i, a_char) in a.chars().enumerate() {
curr_distances[0] = i + 1;
for (j, b_char) in b.chars().enumerate() {
let cost = if a_char == b_char { 0 } else { 1 };
curr_distances[j + 1] = min(curr_distances[j] + 1,
min(prev_distances[j + 1] + 1,
prev_distances[j] + cost));
if i > 0 && j > 0 && a_char != b_char &&
a_char == prev_b_char && b_char == prev_a_char {
curr_distances[j + 1] = min(curr_distances[j + 1],
prev_two_distances[j - 1] + 1);
}
prev_b_char = b_char;
}
prev_two_distances.clone_from(&prev_distances);
prev_distances.clone_from(&curr_distances);
prev_a_char = a_char;
}
curr_distances[b_len]
}
/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::damerau_levenshtein;
///
/// assert_eq!(2, damerau_levenshtein("ab", "bca"));
/// ```
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
if a == b { return 0; }
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let a_len = a_chars.len();
let b_len = b_chars.len();
if a_len == 0 { return b_len; }
if b_len == 0 { return a_len; }
let mut distances = vec![vec![0; b_len + 2]; a_len + 2];
let max_distance = a_len + b_len;
distances[0][0] = max_distance;
for i in 0..(a_len + 1) {
distances[i + 1][0] = max_distance;
distances[i + 1][1] = i;
}
for j in 0..(b_len + 1) {
distances[0][j + 1] = max_distance;
distances[1][j + 1] = j;
}
let mut chars: HashMap<char, usize> = HashMap::new();
for i in 1..(a_len + 1) {
let mut db = 0;
for j in 1..(b_len + 1) {
let k = match chars.get(&b_chars[j - 1]) {
Some(value) => value.clone(),
None => 0
};
let l = db;
let mut cost = 1;
if a_chars[i - 1] == b_chars[j - 1] {
cost = 0;
db = j;
}
let substitution_cost = distances[i][j] + cost;
let insertion_cost = distances[i][j + 1] + 1;
let deletion_cost = distances[i + 1][j] + 1;
let transposition_cost = distances[k][l] + (i - k - 1) + 1 +
(j - l - 1);
distances[i + 1][j + 1] = min(substitution_cost,
min(insertion_cost,
min(deletion_cost,
transposition_cost)));
}
chars.insert(a_chars[i - 1], i);
}
distances[a_len + 1][b_len + 1]
}
/// Calculates a normalized score of the Damerau–Levenshtein algorithm between
/// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_damerau_levenshtein;
///
/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
if a.is_empty() && b.is_empty() {
return 1.0;
}
1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn hamming_empty() {
match hamming("", "") {
Ok(distance) => { assert_eq!(0, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn hamming_same() {
match hamming("hamming", "hamming") {
Ok(distance) => { assert_eq!(0, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn hamming_diff() {
match hamming("hamming", "hammers") {
Ok(distance) => { assert_eq!(3, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn hamming_diff_multibyte() {
match hamming("hamming", "h香mmüng") {
Ok(distance) => { assert_eq!(2, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn hamming_unequal_length() {
match hamming("ham", "hamming") {
Ok(_) => { panic!(); },
Err(why) => { assert_eq!(why, StrSimError::DifferentLengthArgs); }
}
}
#[test]
fn hamming_names() {
match hamming("Friedrich Nietzs", "Jean-Paul Sartre") {
Ok(distance) => { assert_eq!(14, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn jaro_both_empty() {
assert_eq!(1.0, jaro("", ""));
}
#[test]
fn jaro_first_empty() {
assert_eq!(0.0, jaro("", "jaro"));
}
#[test]
fn jaro_second_empty() {
assert_eq!(0.0, jaro("distance", ""));
}
#[test]
fn jaro_same() {
assert_eq!(1.0, jaro("jaro", "jaro"));
}
#[test]
fn jaro_multibyte() {
assert!((0.818 - jaro("testabctest", "testöঙ香test")) < 0.001);
assert!((0.818 - jaro("testöঙ香test", "testabctest")) < 0.001);
}
#[test]
fn jaro_diff_short() {
assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001);
}
#[test]
fn jaro_diff_one_character() {
assert_eq!(0.0, jaro("a", "b"));
}
#[test]
fn jaro_diff_one_and_two() {
assert!((0.83 - jaro("a", "ab")).abs() < 0.01);
}
#[test]
fn jaro_diff_two_and_one() {
assert!((0.83 - jaro("ab", "a")).abs() < 0.01);
}
#[test]
fn jaro_diff_no_transposition() {
assert!((0.822 - jaro("dwayne", "duane")).abs() < 0.001);
}
#[test]
fn jaro_diff_with_transposition() {
assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001);
}
#[test]
fn jaro_names() {
assert!((0.392 - jaro("Friedrich Nietzsche",
"Jean-Paul Sartre")).abs() < 0.001);
}
#[test]
fn jaro_winkler_both_empty() {
assert_eq!(1.0, jaro_winkler("", ""));
}
#[test]
fn jaro_winkler_first_empty() {
assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
}
#[test]
fn jaro_winkler_second_empty() {
assert_eq!(0.0, jaro_winkler("distance", ""));
}
#[test]
fn jaro_winkler_same() {
assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
}
#[test]
fn jaro_winkler_multibyte() {
assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() <
0.001);
assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() <
0.001);
}
#[test]
fn jaro_winkler_diff_short() {
assert!((0.813 - jaro_winkler("dixon", "dicksonx")).abs() < 0.001);
assert!((0.813 - jaro_winkler("dicksonx", "dixon")).abs() < 0.001);
}
#[test]
fn jaro_winkler_diff_one_character() {
assert_eq!(0.0, jaro_winkler("a", "b"));
}
#[test]
fn jaro_winkler_diff_no_transposition() {
assert!((0.840 - jaro_winkler("dwayne", "duane")).abs() < 0.001);
}
#[test]
fn jaro_winkler_diff_with_transposition() {
assert!((0.961 - jaro_winkler("martha", "marhta")).abs() < 0.001);
}
#[test]
fn jaro_winkler_names() {
assert!((0.562 - jaro_winkler("Friedrich Nietzsche",
"Fran-Paul Sartre")).abs() < 0.001);
}
#[test]
fn jaro_winkler_long_prefix() {
assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
0.001);
}
#[test]
fn jaro_winkler_more_names() {
assert!((0.868 - jaro_winkler("Thorkel", "Thorgier")).abs() < 0.001);
}
#[test]
fn jaro_winkler_length_of_one() {
assert!((0.738 - jaro_winkler("Dinsdale", "D")).abs() < 0.001);
}
#[test]
fn jaro_winkler_very_long_prefix() {
assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx",
"thequickbrownfoxjumpedovery")).abs() <
0.001);
}
#[test]
fn levenshtein_empty() {
assert_eq!(0, levenshtein("", ""));
}
#[test]
fn levenshtein_same() {
assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
}
#[test]
fn levenshtein_diff_short() {
assert_eq!(3, levenshtein("kitten", "sitting"));
}
#[test]
fn levenshtein_diff_with_space() {
assert_eq!(5, levenshtein("hello, world", "bye, world"));
}
#[test]
fn levenshtein_diff_multibyte() {
assert_eq!(3, levenshtein("öঙ香", "abc"));
assert_eq!(3, levenshtein("abc", "öঙ香"));
}
#[test]
fn levenshtein_diff_longer() {
let a = "The quick brown fox jumped over the angry dog.";
let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
assert_eq!(37, levenshtein(a, b));
}
#[test]
fn levenshtein_first_empty() {
assert_eq!(7, levenshtein("", "sitting"));
}
#[test]
fn levenshtein_second_empty() {
assert_eq!(6, levenshtein("kitten", ""));
}
#[test]
fn normalized_levenshtein_diff_short() {
assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_for_empty_strings() {
assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_first_empty() {
assert!(normalized_levenshtein("", "second").abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_second_empty() {
assert!(normalized_levenshtein("first", "").abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_identical_strings() {
assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001);
}
#[test]
fn osa_distance_empty() {
assert_eq!(0, osa_distance("", ""));
}
#[test]
fn osa_distance_same() {
assert_eq!(0, osa_distance("damerau", "damerau"));
}
#[test]
fn osa_distance_first_empty() {
assert_eq!(7, osa_distance("", "damerau"));
}
#[test]
fn osa_distance_second_empty() {
assert_eq!(7, osa_distance("damerau", ""));
}
#[test]
fn osa_distance_diff() {
assert_eq!(3, osa_distance("ca", "abc"));
}
#[test]
fn osa_distance_diff_short() {
assert_eq!(3, osa_distance("damerau", "aderua"));
}
#[test]
fn osa_distance_diff_reversed() {
assert_eq!(3, osa_distance("aderua", "damerau"));
}
#[test]
fn osa_distance_diff_multibyte() {
assert_eq!(3, osa_distance("öঙ香", "abc"));
assert_eq!(3, osa_distance("abc", "öঙ香"));
}
#[test]
fn osa_distance_diff_unequal_length() {
assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
}
#[test]
fn osa_distance_diff_unequal_length_reversed() {
assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
}
#[test]
fn osa_distance_diff_comedians() {
assert_eq!(5, osa_distance("Stewart", "Colbert"));
}
#[test]
fn osa_distance_many_transpositions() {
assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
}
#[test]
fn osa_distance_diff_longer() {
let a = "The quick brown fox jumped over the angry dog.";
let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
assert_eq!(36, osa_distance(a, b));
}
#[test]
fn osa_distance_beginning_transposition() {
assert_eq!(1, osa_distance("foobar", "ofobar"));
}
#[test]
fn osa_distance_end_transposition() {
assert_eq!(1, osa_distance("specter", "spectre"));
}
#[test]
fn osa_distance_restricted_edit() {
assert_eq!(4, osa_distance("a cat", "an abct"));
}
#[test]
fn damerau_levenshtein_empty() {
assert_eq!(0, damerau_levenshtein("", ""));
}
#[test]
fn damerau_levenshtein_same() {
assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
}
#[test]
fn damerau_levenshtein_first_empty() {
assert_eq!(7, damerau_levenshtein("", "damerau"));
}
#[test]
fn damerau_levenshtein_second_empty() {
assert_eq!(7, damerau_levenshtein("damerau", ""));
}
#[test]
fn damerau_levenshtein_diff() {
assert_eq!(2, damerau_levenshtein("ca", "abc"));
}
#[test]
fn damerau_levenshtein_diff_short() {
assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
}
#[test]
fn damerau_levenshtein_diff_reversed() {
assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
}
#[test]
fn damerau_levenshtein_diff_multibyte() {
assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
}
#[test]
fn damerau_levenshtein_diff_unequal_length() {
assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
}
#[test]
fn damerau_levenshtein_diff_unequal_length_reversed() {
assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
}
#[test]
fn damerau_levenshtein_diff_comedians() {
assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
}
#[test]
fn damerau_levenshtein_many_transpositions() {
assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
}
#[test]
fn damerau_levenshtein_diff_longer() {
let a = "The quick brown fox jumped over the angry dog.";
let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
assert_eq!(36, damerau_levenshtein(a, b));
}
#[test]
fn damerau_levenshtein_beginning_transposition() {
assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
}
#[test]
fn damerau_levenshtein_end_transposition() {
assert_eq!(1, damerau_levenshtein("specter", "spectre"));
}
#[test]
fn damerau_levenshtein_unrestricted_edit() {
assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
}
#[test]
fn normalized_damerau_levenshtein_diff_short() {
assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_for_empty_strings() {
assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_first_empty() {
assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_second_empty() {
assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_identical_strings() {
assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
}
}