| import unicodedata |
| from collections import defaultdict |
| from itertools import zip_longest |
| import warnings |
| |
| warnings.warn( |
| "The jellyfish._jellyfish module is deprecated and will be removed in jellyfish 1.0.", |
| DeprecationWarning, |
| stacklevel=2, |
| ) |
| |
| |
| def _normalize(s): |
| return unicodedata.normalize("NFKD", s) |
| |
| |
| def _check_type(s): |
| if not isinstance(s, str): |
| raise TypeError("expected str or unicode, got %s" % type(s).__name__) |
| |
| |
| def levenshtein_distance(s1, s2): |
| _check_type(s1) |
| _check_type(s2) |
| |
| if s1 == s2: |
| return 0 |
| rows = len(s1) + 1 |
| cols = len(s2) + 1 |
| |
| if not s1: |
| return cols - 1 |
| if not s2: |
| return rows - 1 |
| |
| prev = None |
| cur = range(cols) |
| for r in range(1, rows): |
| prev, cur = cur, [r] + [0] * (cols - 1) |
| for c in range(1, cols): |
| deletion = prev[c] + 1 |
| insertion = cur[c - 1] + 1 |
| edit = prev[c - 1] + (0 if s1[r - 1] == s2[c - 1] else 1) |
| cur[c] = min(edit, deletion, insertion) |
| |
| return cur[-1] |
| |
| |
| def _jaro_winkler(s1, s2, long_tolerance, winklerize): |
| _check_type(s1) |
| _check_type(s2) |
| |
| s1_len = len(s1) |
| s2_len = len(s2) |
| |
| if not s1_len or not s2_len: |
| return 0.0 |
| |
| min_len = min(s1_len, s2_len) |
| search_range = max(s1_len, s2_len) |
| search_range = (search_range // 2) - 1 |
| if search_range < 0: |
| search_range = 0 |
| |
| s1_flags = [False] * s1_len |
| s2_flags = [False] * s2_len |
| |
| # looking only within search range, count & flag matched pairs |
| common_chars = 0 |
| for i, s1_ch in enumerate(s1): |
| low = max(0, i - search_range) |
| hi = min(i + search_range, s2_len - 1) |
| for j in range(low, hi + 1): |
| if not s2_flags[j] and s2[j] == s1_ch: |
| s1_flags[i] = s2_flags[j] = True |
| common_chars += 1 |
| break |
| |
| # short circuit if no characters match |
| if not common_chars: |
| return 0.0 |
| |
| # count transpositions |
| k = trans_count = 0 |
| for i, s1_f in enumerate(s1_flags): |
| if s1_f: |
| for j in range(k, s2_len): |
| if s2_flags[j]: |
| k = j + 1 |
| break |
| if s1[i] != s2[j]: |
| trans_count += 1 |
| trans_count //= 2 |
| |
| # adjust for similarities in nonmatched characters |
| common_chars = float(common_chars) |
| weight = ( |
| ( |
| common_chars / s1_len |
| + common_chars / s2_len |
| + (common_chars - trans_count) / common_chars |
| ) |
| ) / 3 |
| |
| # winkler modification: continue to boost if strings are similar |
| if winklerize and weight > 0.7: |
| # adjust for up to first 4 chars in common |
| j = min(min_len, 4) |
| i = 0 |
| while i < j and s1[i] == s2[i]: |
| i += 1 |
| if i: |
| weight += i * 0.1 * (1.0 - weight) |
| |
| # optionally adjust for long strings |
| # after agreeing beginning chars, at least two or more must agree and |
| # agreed characters must be > half of remaining characters |
| if ( |
| long_tolerance |
| and min_len > 4 |
| and common_chars > i + 1 |
| and 2 * common_chars >= min_len + i |
| ): |
| weight += (1.0 - weight) * ( |
| float(common_chars - i - 1) / float(s1_len + s2_len - i * 2 + 2) |
| ) |
| |
| return weight |
| |
| |
| def jaro_similarity(s1, s2): |
| return _jaro_winkler(s1, s2, False, False) # noqa |
| |
| |
| def jaro_winkler_similarity(s1, s2, long_tolerance=False): |
| return _jaro_winkler(s1, s2, long_tolerance, True) # noqa |
| |
| |
| def damerau_levenshtein_distance(s1, s2): |
| _check_type(s1) |
| _check_type(s2) |
| |
| len1 = len(s1) |
| len2 = len(s2) |
| infinite = len1 + len2 |
| |
| # character array |
| da = defaultdict(int) |
| |
| # distance matrix |
| score = [[0] * (len2 + 2) for x in range(len1 + 2)] |
| |
| score[0][0] = infinite |
| for i in range(0, len1 + 1): |
| score[i + 1][0] = infinite |
| score[i + 1][1] = i |
| for i in range(0, len2 + 1): |
| score[0][i + 1] = infinite |
| score[1][i + 1] = i |
| |
| for i in range(1, len1 + 1): |
| db = 0 |
| for j in range(1, len2 + 1): |
| i1 = da[s2[j - 1]] |
| j1 = db |
| cost = 1 |
| if s1[i - 1] == s2[j - 1]: |
| cost = 0 |
| db = j |
| |
| score[i + 1][j + 1] = min( |
| score[i][j] + cost, |
| score[i + 1][j] + 1, |
| score[i][j + 1] + 1, |
| score[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1), |
| ) |
| da[s1[i - 1]] = i |
| |
| return score[len1 + 1][len2 + 1] |
| |
| |
| def soundex(s): |
| _check_type(s) |
| |
| if not s: |
| return "" |
| |
| s = _normalize(s) |
| s = s.upper() |
| |
| replacements = ( |
| ("BFPV", "1"), |
| ("CGJKQSXZ", "2"), |
| ("DT", "3"), |
| ("L", "4"), |
| ("MN", "5"), |
| ("R", "6"), |
| ) |
| result = [s[0]] |
| count = 1 |
| |
| # find would-be replacement for first character |
| for lset, sub in replacements: |
| if s[0] in lset: |
| last = sub |
| break |
| else: |
| last = None |
| |
| for letter in s[1:]: |
| for lset, sub in replacements: |
| if letter in lset: |
| if sub != last: |
| result.append(sub) |
| count += 1 |
| last = sub |
| break |
| else: |
| if letter != "H" and letter != "W": |
| # leave last alone if middle letter is H or W |
| last = None |
| if count == 4: |
| break |
| |
| result += "0" * (4 - count) |
| return "".join(result) |
| |
| |
| def hamming_distance(s1, s2): |
| _check_type(s1) |
| _check_type(s2) |
| |
| # ensure length of s1 >= s2 |
| if len(s2) > len(s1): |
| s1, s2 = s2, s1 |
| |
| # distance is difference in length + differing chars |
| distance = len(s1) - len(s2) |
| for i, c in enumerate(s2): |
| if c != s1[i]: |
| distance += 1 |
| |
| return distance |
| |
| |
| def nysiis(s): |
| _check_type(s) |
| |
| if not s: |
| return "" |
| |
| s = s.upper() |
| key = [] |
| |
| # step 1 - prefixes |
| if s.startswith("MAC"): |
| s = "MCC" + s[3:] |
| elif s.startswith("KN"): |
| s = s[1:] |
| elif s.startswith("K"): |
| s = "C" + s[1:] |
| elif s.startswith(("PH", "PF")): |
| s = "FF" + s[2:] |
| elif s.startswith("SCH"): |
| s = "SSS" + s[3:] |
| |
| # step 2 - suffixes |
| if s.endswith(("IE", "EE")): |
| s = s[:-2] + "Y" |
| elif s.endswith(("DT", "RT", "RD", "NT", "ND")): |
| s = s[:-2] + "D" |
| |
| # step 3 - first character of key comes from name |
| key.append(s[0]) |
| |
| # step 4 - translate remaining chars |
| i = 1 |
| len_s = len(s) |
| while i < len_s: |
| ch = s[i] |
| if ch == "E" and i + 1 < len_s and s[i + 1] == "V": |
| ch = "AF" |
| i += 1 |
| elif ch in "AEIOU": |
| ch = "A" |
| elif ch == "Q": |
| ch = "G" |
| elif ch == "Z": |
| ch = "S" |
| elif ch == "M": |
| ch = "N" |
| elif ch == "K": |
| if i + 1 < len(s) and s[i + 1] == "N": |
| ch = "N" |
| else: |
| ch = "C" |
| elif ch == "S" and s[i + 1 : i + 3] == "CH": |
| ch = "SS" |
| i += 2 |
| elif ch == "P" and i + 1 < len(s) and s[i + 1] == "H": |
| ch = "F" |
| i += 1 |
| elif ch == "H" and ( |
| s[i - 1] not in "AEIOU" |
| or (i + 1 < len(s) and s[i + 1] not in "AEIOU") |
| or (i + 1 == len(s)) |
| ): |
| if s[i - 1] in "AEIOU": |
| ch = "A" |
| else: |
| ch = s[i - 1] |
| elif ch == "W" and s[i - 1] in "AEIOU": |
| ch = s[i - 1] |
| |
| if ch[-1] != key[-1][-1]: |
| key.append(ch) |
| |
| i += 1 |
| |
| key = "".join(key) |
| |
| # step 5 - remove trailing S |
| if key.endswith("S") and key != "S": |
| key = key[:-1] |
| |
| # step 6 - replace AY w/ Y |
| if key.endswith("AY"): |
| key = key[:-2] + "Y" |
| |
| # step 7 - remove trailing A |
| if key.endswith("A") and key != "A": |
| key = key[:-1] |
| |
| # step 8 was already done |
| |
| return key |
| |
| |
| def match_rating_codex(s): |
| _check_type(s) |
| |
| # we ignore spaces |
| s = s.upper().replace(" ", "") |
| codex = [] |
| |
| prev = None |
| first = True |
| for c in s: |
| # starting character |
| # or consonant not preceded by same consonant |
| if first or (c not in "AEIOU" and c != prev): |
| codex.append(c) |
| |
| prev = c |
| first = False |
| |
| # just use first/last 3 |
| if len(codex) > 6: |
| return "".join(codex[:3] + codex[-3:]) |
| else: |
| return "".join(codex) |
| |
| |
| def match_rating_comparison(s1, s2): |
| codex1 = match_rating_codex(s1) |
| codex2 = match_rating_codex(s2) |
| len1 = len(codex1) |
| len2 = len(codex2) |
| res1 = [] |
| res2 = [] |
| |
| # length differs by 3 or more, no result |
| if abs(len1 - len2) >= 3: |
| return None |
| |
| # get minimum rating based on sums of codexes |
| lensum = len1 + len2 |
| if lensum <= 4: |
| min_rating = 5 |
| elif lensum <= 7: |
| min_rating = 4 |
| elif lensum <= 11: |
| min_rating = 3 |
| else: |
| min_rating = 2 |
| |
| # strip off common prefixes |
| for c1, c2 in zip_longest(codex1, codex2): |
| if c1 != c2: |
| if c1: |
| res1.append(c1) |
| if c2: |
| res2.append(c2) |
| |
| unmatched_count1 = unmatched_count2 = 0 |
| for c1, c2 in zip_longest(reversed(res1), reversed(res2)): |
| if c1 != c2: |
| if c1: |
| unmatched_count1 += 1 |
| if c2: |
| unmatched_count2 += 1 |
| |
| return (6 - max(unmatched_count1, unmatched_count2)) >= min_rating |
| |
| |
| def metaphone(s): |
| _check_type(s) |
| |
| result = [] |
| |
| s = _normalize(s.lower()) |
| |
| # skip first character if s starts with these |
| if s.startswith(("kn", "gn", "pn", "wr", "ae")): |
| s = s[1:] |
| |
| i = 0 |
| |
| while i < len(s): |
| c = s[i] |
| next = s[i + 1] if i < len(s) - 1 else "*****" |
| nextnext = s[i + 2] if i < len(s) - 2 else "*****" |
| |
| # skip doubles except for cc |
| if c == next and c != "c": |
| i += 1 |
| continue |
| |
| if c in "aeiou": |
| if i == 0 or s[i - 1] == " ": |
| result.append(c) |
| elif c == "b": |
| if (not (i != 0 and s[i - 1] == "m")) or next: |
| result.append("b") |
| elif c == "c": |
| if next == "i" and nextnext == "a" or next == "h": |
| result.append("x") |
| i += 1 |
| elif next in "iey": |
| result.append("s") |
| i += 1 |
| else: |
| result.append("k") |
| elif c == "d": |
| if next == "g" and nextnext in "iey": |
| result.append("j") |
| i += 2 |
| else: |
| result.append("t") |
| elif c in "fjlmnr": |
| result.append(c) |
| elif c == "g": |
| if next in "iey": |
| result.append("j") |
| elif next == "h" and nextnext and nextnext not in "aeiou": |
| i += 1 |
| elif next == "n" and not nextnext: |
| i += 1 |
| else: |
| result.append("k") |
| elif c == "h": |
| if i == 0 or next in "aeiou" or s[i - 1] not in "aeiou": |
| result.append("h") |
| elif c == "k": |
| if i == 0 or s[i - 1] != "c": |
| result.append("k") |
| elif c == "p": |
| if next == "h": |
| result.append("f") |
| i += 1 |
| else: |
| result.append("p") |
| elif c == "q": |
| result.append("k") |
| elif c == "s": |
| if next == "h": |
| result.append("x") |
| i += 1 |
| elif next == "i" and nextnext in "oa": |
| result.append("x") |
| i += 2 |
| else: |
| result.append("s") |
| elif c == "t": |
| if next == "i" and nextnext in "oa": |
| result.append("x") |
| elif next == "h": |
| result.append("0") |
| i += 1 |
| elif next != "c" or nextnext != "h": |
| result.append("t") |
| elif c == "v": |
| result.append("f") |
| elif c == "w": |
| if i == 0 and next == "h": |
| i += 1 |
| result.append("w") |
| elif next in "aeiou": |
| result.append("w") |
| elif c == "x": |
| if i == 0: |
| if next == "h" or (next == "i" and nextnext in "oa"): |
| result.append("x") |
| else: |
| result.append("s") |
| else: |
| result.append("k") |
| result.append("s") |
| elif c == "y": |
| if next in "aeiou": |
| result.append("y") |
| elif c == "z": |
| result.append("s") |
| elif c == " ": |
| if len(result) > 0 and result[-1] != " ": |
| result.append(" ") |
| |
| i += 1 |
| |
| return "".join(result).upper() |