# blob: 0946c09dbc326b97c2ea16e1001e03b431a96ba1 [file] [log] [blame]
# -*- coding: utf-8 -*-
import csv
import unittest
import jellyfish
class JellyfishTestCase(unittest.TestCase):
    """Table-driven tests for the functions exposed by ``jellyfish``.

    Most tests iterate over a list of ``(input..., expected)`` tuples and
    compare the library's result against the expected value.
    """

    # Keep unittest's default failure text and append the per-case ``msg``
    # (which names the failing input) instead of replacing it.
    longMessage = True

    def test_jaro_winkler(self):
        """jaro_winkler agrees with the expected scores to 4 decimal places."""
        cases = [("dixon", "dicksonx", 0.8133),
                 ("martha", "marhta", 0.9611),
                 ("dwayne", "duane", 0.84),
                 ]
        for (s1, s2, value) in cases:
            self.assertAlmostEqual(jellyfish.jaro_winkler(s1, s2), value,
                                   places=4,
                                   msg="jaro_winkler(%r, %r)" % (s1, s2))

    def test_jaro_distance(self):
        """jaro_distance agrees with the expected scores to 3 decimal places."""
        cases = [("dixon", "dicksonx", 0.767),
                 ("martha", "marhta", 0.944),
                 ("dwayne", "duane", 0.822),
                 ]
        for (s1, s2, value) in cases:
            self.assertAlmostEqual(jellyfish.jaro_distance(s1, s2), value,
                                   places=3,
                                   msg="jaro_distance(%r, %r)" % (s1, s2))

    def test_hamming_distance(self):
        """hamming_distance returns the expected count, including for
        empty strings and strings of unequal length."""
        cases = [("", "", 0),
                 ("", "abc", 3),
                 ("abc", "abc", 0),
                 ("acc", "abc", 1),
                 ("abcd", "abc", 1),
                 ("abc", "abcd", 1),
                 ("testing", "this is a test", 13),
                 ]
        for (s1, s2, value) in cases:
            self.assertEqual(jellyfish.hamming_distance(s1, s2), value,
                             msg="hamming_distance(%r, %r)" % (s1, s2))

    def test_levenshtein_distance(self):
        """levenshtein_distance returns the expected edit distance."""
        cases = [("", "", 0),
                 ("abc", "", 3),
                 ("bc", "abc", 1),
                 ("kitten", "sitting", 3),
                 ("Saturday", "Sunday", 3),
                 ]
        for (s1, s2, value) in cases:
            self.assertEqual(jellyfish.levenshtein_distance(s1, s2), value,
                             msg="levenshtein_distance(%r, %r)" % (s1, s2))

    def test_damerau_levenshtein_distance(self):
        """damerau_levenshtein_distance returns the expected distance;
        the ("abc", "acb") case expects a swap of neighbours to cost 1."""
        cases = [("", "", 0),
                 ("abc", "", 3),
                 ("bc", "abc", 1),
                 ("abc", "acb", 1),
                 ]
        for (s1, s2, value) in cases:
            self.assertEqual(jellyfish.damerau_levenshtein_distance(s1, s2),
                             value,
                             msg="damerau_levenshtein_distance(%r, %r)"
                                 % (s1, s2))

    def test_soundex(self):
        """soundex produces the expected code, including for the empty
        string and for a string of accented characters."""
        cases = [("Washington", "W252"),
                 ("Lee", "L000"),
                 ("Gutierrez", "G362"),
                 ("Pfister", "P236"),
                 ("Jackson", "J250"),
                 ("Tymczak", "T522"),
                 ("", ""),
                 ("A", "A000"),
                 (u"Çáŕẗéř", "C636"),
                 ]
        for (s1, code) in cases:
            self.assertEqual(jellyfish.soundex(s1), code,
                             msg="soundex(%r)" % (s1,))

    def test_metaphone(self):
        """metaphone produces the expected code for mixed-case input,
        multi-word input and accented characters."""
        cases = [("metaphone", 'MTFN'),
                 ("wHErE", "WR"),
                 ("shell", "XL"),
                 ("this is a difficult string", "0S IS A TFKLT STRNK"),
                 ("aeromancy", "ERMNS"),
                 ("Antidisestablishmentarianism", "ANTTSSTBLXMNTRNSM"),
                 ("sunlight labs", "SNLT LBS"),
                 ("sonlite laabz", "SNLT LBS"),
                 (u"Çáŕẗéř", "KRTR"),
                 ('kentucky', 'KNTK'),
                 ('KENTUCKY', 'KNTK'),
                 ]
        for (s1, code) in cases:
            self.assertEqual(jellyfish.metaphone(s1), code,
                             msg="metaphone(%r)" % (s1,))

    def test_nysiis(self):
        """nysiis produces the expected code for each sample name."""
        cases = [("Worthy", "WARTY"),
                 ("Ogata", "OGAT"),
                 ("montgomery", "MANTGANARY"),
                 ("Costales", "CASTAL"),
                 ("Tu", "T"),
                 ("martincevic", "MARTANCAFAC"),
                 ("Catherine", "CATARAN"),
                 ("Katherine", "CATARAN"),
                 ("Katerina", "CATARAN"),
                 ("Johnathan", "JANATAN"),
                 ("Jonathan", "JANATAN"),
                 ("John", "JAN"),
                 ("Teresa", "TARAS"),
                 ("Theresa", "TARAS"),
                 ("Jessica", "JASAC"),
                 ("Joshua", "JAS"),
                 ("Bosch", "BAS"),
                 ("Lapher", "LAFAR"),
                 ]
        for (s1, s2) in cases:
            self.assertEqual(jellyfish.nysiis(s1), s2,
                             msg="nysiis(%r)" % (s1,))

    def test_match_rating_codex(self):
        """match_rating_codex produces the expected codex for each name."""
        cases = [("Byrne", "BYRN"),
                 ("Boern", "BRN"),
                 ("Smith", "SMTH"),
                 ("Smyth", "SMYTH"),
                 ("Catherine", "CTHRN"),
                 ("Kathryn", "KTHRYN"),
                 ]
        for (s1, s2) in cases:
            self.assertEqual(jellyfish.match_rating_codex(s1), s2,
                             msg="match_rating_codex(%r)" % (s1,))

    def test_match_rating_comparison(self):
        """match_rating_comparison returns the expected boolean per pair."""
        cases = [("Bryne", "Boern", True),
                 ("Smith", "Smyth", True),
                 ("Catherine", "Kathryn", True),
                 ("Michael", "Mike", False),
                 ]
        for (s1, s2, value) in cases:
            self.assertEqual(jellyfish.match_rating_comparison(s1, s2), value,
                             msg="match_rating_comparison(%r, %r)" % (s1, s2))

    def test_match_rating_comparison_segfault(self):
        """Call match_rating_comparison on every ordered pair of 100 SHA-1
        hex digests; the test passes if no call crashes or raises."""
        import hashlib
        sha1s = [hashlib.sha1(str(v).encode('ascii')).hexdigest()
                 for v in range(100)]
        # this segfaulted on 0.1.2
        # The calls are made in plain loops rather than inside an ``assert``
        # statement: ``python -O`` strips assert statements entirely, which
        # would skip every call and leave the regression unexercised.
        # Return values are deliberately ignored.
        for h2 in sha1s:
            for h1 in sha1s:
                jellyfish.match_rating_comparison(h1, h2)

    def test_porter_stem(self):
        """porter_stem maps the first column of each row in
        'porter-test.csv' to the second column (both lower-cased).

        The CSV path is relative, so it is resolved against the current
        working directory of the test run.
        """
        with open('porter-test.csv') as f:
            reader = csv.reader(f)
            for (a, b) in reader:
                self.assertEqual(jellyfish.porter_stem(a.lower()), b.lower(),
                                 msg="porter_stem(%r)" % (a.lower(),))
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()