| /*************************************************************************/ |
| /* */ |
| /* Language Technologies Institute */ |
| /* Carnegie Mellon University */ |
| /* Copyright (c) 2001 */ |
| /* All Rights Reserved. */ |
| /* */ |
| /* Permission is hereby granted, free of charge, to use and distribute */ |
| /* this software and its documentation without restriction, including */ |
| /* without limitation the rights to use, copy, modify, merge, publish, */ |
| /* distribute, sublicense, and/or sell copies of this work, and to */ |
| /* permit persons to whom this work is furnished to do so, subject to */ |
| /* the following conditions: */ |
| /* 1. The code must retain the above copyright notice, this list of */ |
| /* conditions and the following disclaimer. */ |
| /* 2. Any modifications must be clearly marked as such. */ |
| /* 3. Original authors' names are not deleted. */ |
| /* 4. The authors' names are not used to endorse or promote products */ |
| /* derived from this software without specific prior written */ |
| /* permission. */ |
| /* */ |
| /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ |
| /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ |
| /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
| /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ |
| /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ |
| /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ |
| /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ |
| /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ |
| /* THIS SOFTWARE. */ |
| /* */ |
| /*************************************************************************/ |
| /* Author: Alan W Black (awb@cs.cmu.edu) */ |
| /* Date: January 2001 */ |
| /*************************************************************************/ |
| /* */ |
| /* CMU Lexicon definition */ |
| /* */ |
| /*************************************************************************/ |
| |
#include <string.h>

#include "flite.h"

#include "cmu_lex.h"
| |
| extern const int cmu_lex_entry[]; |
| extern const unsigned char cmu_lex_data[]; |
| extern const int cmu_lex_num_entries; |
| extern const int cmu_lex_num_bytes; |
| extern const char * const cmu_lex_phone_table[54]; |
| extern const char * const cmu_lex_phones_huff_table[]; |
| extern const char * const cmu_lex_entries_huff_table[]; |
| |
| static int cmu_is_vowel(const char *p); |
| static int cmu_is_silence(const char *p); |
| static int cmu_has_vowel_in_list(const cst_val *v); |
| static int cmu_has_vowel_in_syl(const cst_item *i); |
| static int cmu_sonority(const char *p); |
| |
/* Hand-written lexicon addenda.                                          */
/* Each entry is a NULL-terminated list of strings.  The first string     */
/* looks like a one-character tag followed by the word or symbol itself,  */
/* and the remaining strings look like phone names (assumed from the      */
/* data -- confirm against the lexicon lookup code).                      */

/* Punctuation entries with no phones */
static const char * const addenda0[]  = { "p,", NULL };
static const char * const addenda1[]  = { "p.", NULL };
static const char * const addenda2[]  = { "p(", NULL };
static const char * const addenda3[]  = { "p)", NULL };
static const char * const addenda4[]  = { "p[", NULL };
static const char * const addenda5[]  = { "p]", NULL };
static const char * const addenda6[]  = { "p{", NULL };
static const char * const addenda7[]  = { "p}", NULL };
static const char * const addenda8[]  = { "p:", NULL };
static const char * const addenda9[]  = { "p;", NULL };
static const char * const addenda10[] = { "p?", NULL };
static const char * const addenda11[] = { "p!", NULL };

/* Symbols that are given a pronunciation */
static const char * const addenda12[] = { "n@", "ae1", "t", NULL };
static const char * const addenda13[] = { "n#", "hh", "ae1", "sh", NULL };
static const char * const addenda14[] = { "n$", "d", "aa1", "l", "er", NULL };
static const char * const addenda15[] = { "n%", "p", "er", "s", "eh1", "n", "t", NULL };
static const char * const addenda16[] = { "n^", "k", "eh1", "r", "eh1", "t", NULL };
static const char * const addenda17[] = { "n&", "ae1", "m", "p", "er", "s", "ae1", "n", "d", NULL };
static const char * const addenda18[] = { "n*", "ae1", "s", "t", "er", "ih1", "s", "k", NULL };
static const char * const addenda19[] = { "n|", "b", "aa1", "r", NULL };
static const char * const addenda20[] = { "n\\", "b", "ae1", "k", "s", "l", "ae1", "sh", NULL };
static const char * const addenda21[] = { "n=", "iy1", "k", "w", "ax", "l", "z", NULL };
static const char * const addenda22[] = { "n+", "p", "l", "ah1", "s", NULL };
static const char * const addenda23[] = { "n~", "t", "ih1", "l", "d", "ax", NULL };

/* More punctuation entries with no phones */
static const char * const addenda24[] = { "p'", NULL };
static const char * const addenda25[] = { "p`", NULL };
static const char * const addenda26[] = { "p\"", NULL };
static const char * const addenda27[] = { "p-", NULL };
static const char * const addenda28[] = { "p<", NULL };
static const char * const addenda29[] = { "p>", NULL };

/* Remaining symbols and individual words */
static const char * const addenda30[] = { "n_", "ah1", "n", "d", "er", "s", "k", "ao1", "r", NULL };
static const char * const addenda31[] = { "s's", "z", NULL };
static const char * const addenda32[] = { "nim", "ay1", "m", NULL };
static const char * const addenda33[] = { "vdoesnt", "d", "ah1", "z", "n", "t", NULL };
static const char * const addenda34[] = { "vyoull", "y", "uw1", "l", NULL };
static const char * const addenda35[] = { "n/", "s", "l", "ae1", "sh", NULL };

static const char * const addenda36[] = { "nin", "ih", "n", NULL };
static const char * const addenda37[] = { "nto", "t", "ax", NULL };
static const char * const addenda38[] = { "0_a", "ey", NULL };
static const char * const addenda39[] = { "vhavent", "hh", "ae1", "v", "ax", "n", "t", NULL };
static const char * const addenda40[] = { "nemail", "iy1", "m", "ey1", "l", NULL };
static const char * const addenda41[] = { "nshit", "sh", "ih1", "t", NULL };

/* NULL-terminated table of all addenda entries, in declaration order */
static const char * const * const addenda[] = {
    addenda0,  addenda1,  addenda2,  addenda3,  addenda4,  addenda5,
    addenda6,  addenda7,  addenda8,  addenda9,  addenda10, addenda11,
    addenda12, addenda13, addenda14, addenda15, addenda16, addenda17,
    addenda18, addenda19, addenda20, addenda21, addenda22, addenda23,
    addenda24, addenda25, addenda26, addenda27, addenda28, addenda29,
    addenda30, addenda31, addenda32, addenda33, addenda34, addenda35,

    addenda36, addenda37, addenda38, addenda39, addenda40, addenda41,
    NULL
};
| |
| static int cmu_is_silence(const char *p) |
| { |
| if (cst_streq(p,"pau")) |
| return TRUE; |
| else |
| return FALSE; |
| } |
| |
| static int cmu_has_vowel_in_list(const cst_val *v) |
| { |
| const cst_val *t; |
| |
| for (t=v; t; t=val_cdr(t)) |
| if (cmu_is_vowel(val_string(val_car(t)))) |
| return TRUE; |
| return FALSE; |
| } |
| |
| static int cmu_has_vowel_in_syl(const cst_item *i) |
| { |
| const cst_item *n; |
| |
| for (n=i; n; n=item_prev(n)) |
| if (cmu_is_vowel(item_feat_string(n,"name"))) |
| return TRUE; |
| return FALSE; |
| } |
| |
static int cmu_is_vowel(const char *p)
{
    /* Returns non-zero if phone name p starts with one of a, e, i, o, u. */
    /* this happens to work for US English phoneset */

    /* strchr() treats the terminating '\0' as part of the searched      */
    /* string, so an empty name must be rejected before the lookup or it */
    /* would be reported as a vowel.                                     */
    if (p == NULL || p[0] == '\0')
        return 0;

    return strchr("aeiou",p[0]) != NULL;
}
| |
static int cmu_sonority(const char *p)
{
    /* Rough sonority rank (1 lowest .. 5 highest) of phone name p.      */
    /* A bunch of hacks for US English phoneset: after the vowel and     */
    /* silence checks, only the first character of the name is examined. */
    if (cmu_is_vowel(p) || cmu_is_silence(p))
        return 5;

    /* A switch on the first character lists each letter exactly once    */
    /* and cannot accidentally match the string terminator the way a     */
    /* strchr() lookup would for an empty name.                          */
    switch (p[0])
    {
    case 'w': case 'y': case 'l': case 'r':
        return 4;  /* glides/liquids */
    case 'n': case 'm':
        return 3;  /* nasals */
    case 'b': case 'd': case 'g': case 'j': case 'v': case 'z':
        return 2;  /* voiced obstruents */
    default:
        return 1;
    }
}
| |
| int cmu_syl_boundary(const cst_item *i,const cst_val *rest) |
| { |
| /* Returns TRUE if this should be a syllable boundary */ |
| /* This is of course phone set dependent */ |
| int p, n, nn; |
| |
| if (rest == NULL) |
| return TRUE; |
| else if (cmu_is_silence(val_string(val_car(rest)))) |
| return TRUE; |
| else if (!cmu_has_vowel_in_list(rest)) /* no more vowels so rest *all* coda */ |
| return FALSE; |
| else if (!cmu_has_vowel_in_syl(i)) /* need a vowel */ |
| return FALSE; |
| else if (cmu_is_vowel(val_string(val_car(rest)))) |
| return TRUE; |
| else if (val_cdr(rest) == NULL) |
| return FALSE; |
| else |
| { /* so there is following vowel, and multiple phones left */ |
| p = cmu_sonority(item_feat_string(i,"name")); |
| n = cmu_sonority(val_string(val_car(rest))); |
| nn = cmu_sonority(val_string(val_car(val_cdr(rest)))); |
| |
| if ((p <= n) && (n <= nn)) |
| return TRUE; |
| else |
| return FALSE; |
| } |
| } |
| |
| static int cmulex_dist_to_vowel(const cst_val *rest) |
| { |
| if (rest == 0) |
| return 0; /* shouldn't get here */ |
| else if (cmu_is_vowel(val_string(val_car(rest)))) |
| return 0; |
| else |
| return 1+cmulex_dist_to_vowel(val_cdr(rest)); |
| } |
| |
/* Three-phone sequences accepted as onsets by cmulex_onset_trigram(); */
/* each entry is the phone names written back to back                  */
static const char * const cmulex_onset_trigrams[] = {
    "str",
    "spy", "spr", "spl",
    "sky", "skw", "skr", "skl",
    NULL
};

/* Two-phone sequences accepted as onsets by cmulex_onset_bigram();    */
/* entries such as "thw" or "hhy" start with a two-letter phone name   */
static const char * const cmulex_onset_bigrams[] = {
    /* z, v */
    "zw", "zl",
    "vy", "vr", "vl",
    /* th, t */
    "thw", "thr",
    "ty", "tw", "tr", /* "ts", */
    /* sh, s */
    "shw", "shr", "shn", "shm", "shl",
    "sw", "sv", "st", "sr", "sp", "sn", "sm", "sl", "sk", "sf",
    /* p, n, m, l */
    "py", "pw", "pr", "pl",
    "ny",
    "my", "mr",
    "ly",
    /* k, hh, g */
    "ky", "kw", "kr", "kl",
    "hhy", "hhw", "hhr", "hhl",
    "gy", "gw", "gr", "gl",
    /* f, d, b */
    "fy", "fr", "fl",
    "dy", "dw", "dr",
    "by", "bw", "br", "bl",
    NULL
};
| |
| static int cmulex_onset_bigram(const cst_val *rest) |
| { |
| char x[10]; |
| int i; |
| |
| cst_sprintf(x,"%s%s",val_string(val_car(rest)), |
| val_string(val_car(val_cdr(rest)))); |
| for (i=0; cmulex_onset_bigrams[i]; i++) |
| if (cst_streq(x,cmulex_onset_bigrams[i])) |
| return TRUE; |
| return FALSE; |
| } |
| |
| static int cmulex_onset_trigram(const cst_val *rest) |
| { |
| char x[15]; |
| int i; |
| |
| cst_sprintf(x,"%s%s%s",val_string(val_car(rest)), |
| val_string(val_car(val_cdr(rest))), |
| val_string(val_car(val_cdr(val_cdr(rest))))); |
| for (i=0; cmulex_onset_trigrams[i]; i++) |
| if (cst_streq(x,cmulex_onset_trigrams[i])) |
| return TRUE; |
| return FALSE; |
| } |
| |
| int cmu_syl_boundary_mo(const cst_item *i,const cst_val *rest) |
| { |
| /* syl boundary maximal onset */ |
| int d2v; |
| |
| if (rest == NULL) |
| return TRUE; |
| else if (cmu_is_silence(val_string(val_car(rest)))) |
| return TRUE; |
| else if (!cmu_has_vowel_in_list(rest)) |
| /* no more vowels so rest *all* coda */ |
| return FALSE; |
| else if (!cmu_has_vowel_in_syl(i)) /* need a vowel */ |
| /* no vowel yet in syl so keep copying */ |
| return FALSE; |
| else if (cmu_is_vowel(val_string(val_car(rest)))) |
| /* next is a vowel, syl has vowel, so this is a break */ |
| return TRUE; |
| else if (cst_streq("ng",val_string(val_car(rest)))) |
| /* next is "ng" which can't start a word internal syl */ |
| return FALSE; |
| else |
| { |
| /* want to know if from rest to the next vowel is a valid onset */ |
| d2v = cmulex_dist_to_vowel(rest); |
| if (d2v < 2) |
| return TRUE; |
| else if (d2v > 3) |
| return FALSE; |
| else if (d2v == 2) |
| return cmulex_onset_bigram(rest); |
| else /* if (d2v == 3) */ |
| return cmulex_onset_trigram(rest); |
| return TRUE; |
| } |
| |
| } |
| |
| cst_lexicon cmu_lex; |
| cst_lts_rules cmu_lts_rules; |
| extern const char * const cmu_lts_phone_table[]; |
| extern const char * const cmu_lts_letter_table[]; |
| extern const cst_lts_addr cmu_lts_letter_index[]; |
| extern const cst_lts_model cmu_lts_model[]; |
| |
| cst_lexicon *cmulex_init() |
| { |
| /* We actually need the init function match the directory name */ |
| return cmu_lex_init(); |
| } |
| |
| cst_lexicon *cmu_lex_init() |
| { |
| /* I'd like to do this as a const but it needs everything in this */ |
| /* file and already the bits are too big for some compilers */ |
| |
| if (cmu_lts_rules.name) |
| return &cmu_lex; /* Already initialized */ |
| |
| cmu_lts_rules.name = "cmu"; |
| cmu_lts_rules.letter_index = cmu_lts_letter_index; |
| #ifdef CST_NO_STATIC_LTS_MODEL |
| /* cmu_lts_rules.models will be set elsewhere */ |
| #else |
| cmu_lts_rules.models = cmu_lts_model; |
| #endif |
| cmu_lts_rules.phone_table = cmu_lts_phone_table; |
| cmu_lts_rules.context_window_size = 4; |
| cmu_lts_rules.context_extra_feats = 1; |
| cmu_lts_rules.letter_table = 0 /* cmu_lts_letter_table */; |
| |
| cmu_lex.name = "cmu"; |
| cmu_lex.num_entries = cmu_lex_num_entries; |
| #ifdef CST_NO_STATIC_LEX |
| /* cmu_lex.data will be set elsewhere */ |
| #else |
| /* as the data is const, we cast it through void * */ |
| cmu_lex.data = (unsigned char *)(void *)cmu_lex_data; |
| #endif |
| cmu_lex.num_bytes = cmu_lex_num_bytes; |
| cmu_lex.phone_table = (char **) cmu_lex_phone_table; |
| cmu_lex.syl_boundary = cmu_syl_boundary_mo; |
| cmu_lex.addenda = (char ***) addenda; |
| cmu_lex.lts_rule_set = (cst_lts_rules *) &cmu_lts_rules; |
| |
| cmu_lex.phone_hufftable = cmu_lex_phones_huff_table; |
| cmu_lex.entry_hufftable = cmu_lex_entries_huff_table; |
| |
| cmu_lex.postlex = cmu_postlex; |
| |
| return &cmu_lex; |
| |
| } |