# blob: b7bd9a4ec6265214772a5207b34c64e9bee6b7b2 [file] [log] [blame]
from _jellyfish import (metaphone, match_rating_codex, porter_stem,
soundex, hamming_distance, match_rating_comparison,
jaro_distance, jaro_winkler, levenshtein_distance,
damerau_levenshtein_distance)
from .compat import _no_bytes_err
cpdef nysiis(unicode s):
    """Compute a NYSIIS-style phonetic key for ``s``.

    The input is upper-cased, a leading prefix and a trailing suffix are
    rewritten, and then every character after the first is translated and
    appended to the key unless its last character repeats the key's current
    last character.  Finally a trailing ``S``, a trailing ``AY`` and a
    trailing ``A`` are trimmed, in that order.

    Returns ``''`` when ``s`` is empty (falsy).  Raises ``TypeError`` when
    ``s`` is a ``bytes`` instance.
    """
    if isinstance(s, bytes):
        raise TypeError(_no_bytes_err)
    if not s:
        return ''

    word = s.upper()

    # step 1 - prefixes; the first match wins, so 'KN' is listed before 'K'
    for old, new in (('MAC', 'MCC'), ('KN', 'N'), ('K', 'C'),
                     ('PH', 'FF'), ('PF', 'FF'), ('SCH', 'SSS')):
        if word.startswith(old):
            word = new + word[len(old):]
            break

    # step 2 - suffixes (every candidate is two characters long)
    if word.endswith(('IE', 'EE')):
        word = word[:-2] + 'Y'
    elif word.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
        word = word[:-2] + 'D'

    # step 3 - first character of key comes from name
    pieces = [word[0]]

    # step 4 - translate remaining chars
    vowels = 'AEIOU'
    swaps = {'Q': 'G', 'Z': 'S', 'M': 'N'}
    cdef int pos = 1
    total = len(word)
    while pos < total:
        cur = word[pos]
        before = word[pos - 1]
        # None marks "no following character" and never equals a letter
        after = word[pos + 1] if pos + 1 < total else None

        if cur == 'E' and after == u'V':
            out = 'AF'
            pos += 1            # the 'V' is consumed together with the 'E'
        elif cur in vowels:
            out = 'A'
        elif cur in swaps:
            out = swaps[cur]
        elif cur == 'K':
            out = 'N' if after == u'N' else 'C'
        elif cur == 'S' and word[pos + 1:pos + 3] == 'CH':
            out = 'SS'
            pos += 2            # skip the 'CH'
        elif cur == 'P' and after == u'H':
            out = 'F'
            pos += 1            # skip the 'H'
        elif cur == 'H' and (before not in vowels
                             or (after is not None and after not in vowels)):
            out = 'A' if before in vowels else before
        elif cur == 'W' and before in vowels:
            out = before
        else:
            out = cur

        # only the last character of the replacement is compared with the
        # key's current tail, so multi-character replacements such as 'AF'
        # are checked on their final letter
        if out[-1] != pieces[-1][-1]:
            pieces.append(out)

        pos += 1

    code = ''.join(pieces)

    # step 5 - remove trailing S
    if code != 'S' and code.endswith('S'):
        code = code[:-1]

    # step 6 - replace AY w/ Y
    if code.endswith('AY'):
        code = code[:-2] + 'Y'

    # step 7 - remove trailing A
    if code != 'A' and code.endswith('A'):
        code = code[:-1]

    # step 8 was already done
    return code