/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 1999 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author: Alan W Black (awb@cs.cmu.edu) */
/* Date: December 1999 */
/*************************************************************************/
/* */
/* Lexicon related functions */
/* */
/*************************************************************************/
#include "cst_features.h"
#include "cst_lexicon.h"
#include "cst_tokenstream.h"
CST_VAL_REGISTER_TYPE_NODEL(lexicon,cst_lexicon)
#define WP_SIZE 64
static int no_syl_boundaries(const cst_item *i, const cst_val *p);
static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
int *found);
static int lex_match_entry(const char *a, const char *b);
static int lex_lookup_bsearch(const cst_lexicon *l,const char *word);
static int find_full_match(const cst_lexicon *l,
int i,const char *word);
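/* Allocate a new, empty lexicon with the default (no) syllable boundary
   function */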
cst_lexicon *new_lexicon()
{
cst_lexicon *l = cst_alloc(cst_lexicon,1);
l->syl_boundary = no_syl_boundaries;
return l;
}
void delete_lexicon(cst_lexicon *lex)
{ /* Lexicons are normally memory mapped, so this is unlikely to be called */
/* and it is probably not a complete clean up anyway */
if (lex)
{
cst_free(lex->data);
cst_free(lex);
}
}
cst_val *cst_lex_load_addenda(const cst_lexicon *lex, const char *lexfile)
{ /* Load addenda entries from the given file, checking their phones against lex */
cst_tokenstream *lf;
const cst_string *line;
cst_val *e = NULL;
cst_val *na = NULL;
int i;
lf = ts_open(lexfile,"\n","","","");
if (lf == NULL)
{
cst_errmsg("cst_lex_load_addenda: cannot open lexicon file %s\n",lexfile);
return NULL;
}
while (!ts_eof(lf))
{
line = ts_get(lf);
if (line[0] == '#')
continue; /* a comment */
for (i=0; line[i]; i++)
{
if (line[i] != ' ')
break;
}
if (line[i])
{
e = cst_lex_make_entry(lex,line);
if (e)
na = cons_val(e,na);
}
else
continue; /* a blank line */
}
ts_close(lf);
return val_reverse(na);
}
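/* An addenda entry is one line of the form
     "word" [pos] : phone phone ... [# comment]
   where the quotes and the pos are optional and ":" separates the head
   from the phones (this is the format the parsing below accepts) */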
cst_val *cst_lex_make_entry(const cst_lexicon *lex, const cst_string *entry)
{ /* Parse entry into a (word pos phone ...) val, checking that each */
/* phone is in lex's phoneset */
cst_tokenstream *e;
cst_val *phones = NULL;
cst_val *ventry;
const cst_string *w, *p;
cst_string *word;
cst_string *pos;
int i;
e = ts_open_string(entry,
cst_ts_default_whitespacesymbols,
"","","");
w = ts_get(e);
if (w[0] == '"') /* it was a quoted entry */
{ /* so reparse it */
ts_close(e);
e = ts_open_string(entry,
cst_ts_default_whitespacesymbols,
"","","");
w = ts_get_quoted_token(e,'"','\\');
}
word = cst_strdup(w);
p = ts_get(e);
if (!cst_streq(":",p)) /* there is a real pos */
{
pos = cst_strdup(p);
p = ts_get(e);
if (!cst_streq(":",p)) /* but then a ":" must follow */
{
cst_fprintf(stdout,"add_addenda: lex %s: expected \":\" in %s\n",
lex->name,
word);
cst_free(word); cst_free(pos); ts_close(e);
return NULL;
}
}
else
pos = cst_strdup("nil");
while (!ts_eof(e))
{
p = ts_get(e);
/* Check it's a legal phone */
for (i=0; lex->phone_table[i]; i++)
{
if (cst_streq(p,lex->phone_table[i]))
break;
}
if (cst_streq("#",p)) /* comment to end of line */
break;
else if (cst_streq("",p)) /* trailing ws at eoln causes this */
break;
else if (lex->phone_table[i])
/* Only add it if it's a valid phone */
phones = cons_val(string_val(p),phones);
else
{
cst_fprintf(stdout,"add_addenda: lex: %s word %s phone %s not in lexicon phoneset\n",
lex->name,
word,
p);
}
}
ventry = cons_val(string_val(word),cons_val(string_val(pos),
val_reverse(phones)));
cst_free(word); cst_free(pos); ts_close(e);
#if 0
printf("entry: ");
val_print(stdout,ventry);
printf("\n");
#endif
return ventry;
}
#if 0
void lexicon_register(cst_lexicon *lex)
{
/* Add given lexicon to list of known lexicons */
cst_lexicon **old_lexs;
int i;
old_lexs = flite_lexicons;
flite_num_lexicons++;
flite_lexicons = cst_alloc(cst_lexicon *,flite_num_lexicons);
for (i=0; i<flite_num_lexicons-1; i++)
flite_lexicons[i] = old_lexs[i];
flite_lexicons[i] = lex;
cst_free(old_lexs);
}
cst_lexicon *lexicon_select(const char *name)
{
int i;
for (i=0; i < flite_num_lexicons; i++)
if (cst_streq(name,flite_lexicons[i]->name))
return flite_lexicons[i];
return NULL;
}
#endif
static int no_syl_boundaries(const cst_item *i, const cst_val *p)
{
/* This is a default function that will normally be replaced */
/* for each lexicon */
(void)i;
(void)p;
return FALSE;
}
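/* The pos is folded into lookups as a single character prefixed to the
   word; '0' means "any pos".  in_lex reports whether the word is in the
   addenda or the compiled lexicon (letter-to-sound rules are not
   consulted) */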
int in_lex(const cst_lexicon *l, const char *word, const char *pos,
const cst_features *feats)
{
/* return TRUE if it's in the lexicon (addenda or compiled data) */
int r = FALSE, i;
char *wp;
wp = cst_alloc(char,cst_strlen(word)+2);
cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);
for (i=0; l->addenda && l->addenda[i]; i++)
{
if (((wp[0] == '0') || (wp[0] == l->addenda[i][0][0])) &&
(cst_streq(wp+1,l->addenda[i][0]+1)))
{
r = TRUE;
break;
}
}
if (!r && (lex_lookup_bsearch(l,wp) >= 0))
r = TRUE;
cst_free(wp);
return r;
}
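/* Look up the phones for word/pos: first the addenda, then the compiled
   lexicon, and finally the letter-to-sound function or rule set if the
   word is unknown.  Returns a list of phone name strings */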
cst_val *lex_lookup(const cst_lexicon *l, const char *word, const char *pos,
const cst_features *feats)
{
int index;
int p;
const char *q;
char *wp;
cst_val *phones = 0;
int found = FALSE;
wp = cst_alloc(char,cst_strlen(word)+2);
cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);
if (l->addenda)
phones = lex_lookup_addenda(wp,l,&found);
if (!found)
{
index = lex_lookup_bsearch(l,wp);
if (index >= 0)
{
if (l->phone_hufftable)
{
for (p=index-2; l->data[p]; p--)
for (q=l->phone_hufftable[l->data[p]]; *q; q++)
phones = cons_val(string_val(l->phone_table[(unsigned char)*q]),
phones);
}
else /* no compression -- should we still support this ? */
{
for (p=index-2; l->data[p]; p--)
phones = cons_val(string_val(l->phone_table[l->data[p]]),
phones);
}
phones = val_reverse(phones);
}
else if (l->lts_function)
{
phones = (l->lts_function)(l,word,"",feats);
}
else if (l->lts_rule_set)
{
phones = lts_apply(word,
"", /* more features if we had them */
l->lts_rule_set);
}
}
cst_free(wp);
return phones;
}
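/* Illustrative sketch only (left disabled like the other #if 0 blocks in
   this file): one way the lookup API above might be called.  It assumes
   `lex` is a lexicon that has been built or loaded elsewhere; the example
   word, pos and function name are not part of this file. */
#if 0
static void example_lex_lookup(const cst_lexicon *lex)
{
    cst_val *phones;
    const cst_val *p;

    /* lex_lookup falls back to letter-to-sound rules (or the lexicon's
       lts_function) when the word is not in the addenda or compiled data */
    phones = lex_lookup(lex,"hello","n",NULL);
    for (p = phones; p; p = val_cdr(p))
        printf("%s ",val_string(val_car(p)));
    printf("\n");
    delete_val(phones);

    /* in_lex only reports whether the word is actually present */
    if (!in_lex(lex,"hello","n",NULL))
        printf("\"hello\" not in lexicon, phones came from LTS\n");
}
#endif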
static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
int *found)
{
/* Search the runtime addenda for wp (pos char + word) */
int i,j;
cst_val *phones;
phones = NULL;
for (i=0; l->addenda[i]; i++)
{
if (((wp[0] == '0') ||
(wp[0] == l->addenda[i][0][0]) ||
(l->addenda[i][0][0] == '0')) &&
(cst_streq(wp+1,l->addenda[i][0]+1)))
{
for (j=1; l->addenda[i][j]; j++)
phones = cons_val(string_val(l->addenda[i][j]),phones);
*found = TRUE;
return val_reverse(phones);
}
}
return NULL;
}
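/* Layout of the compiled lexicon data, as assumed by the code below:
   l->data is an array of l->num_bytes bytes in which a 255 byte marks
   the start of an entry.  The 0-terminated (possibly Huffman coded)
   word, prefixed with its single pos character, follows the 255, while
   the entry's (possibly Huffman coded) phone bytes occupy the positions
   immediately before the 255, back to the previous 0 byte. */
/* Expand the entry at byte offset p of l->data into ucword (truncated
   to max_size when Huffman coded) */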
static int lex_uncompress_word(char *ucword,int max_size,
int p,const cst_lexicon *l)
{
int i,j=0,length;
unsigned char *cword;
if (l->entry_hufftable == 0)
/* can have "compressed" lexicons without compression */
cst_sprintf(ucword,"%s",&l->data[p]);
else
{
cword = &l->data[p];
for (i=0,j=0; cword[i]; i++)
{
length = cst_strlen(l->entry_hufftable[cword[i]]);
if (j+length+1<max_size)
{
memmove(ucword+j,l->entry_hufftable[cword[i]],length);
j += length;
}
else
break;
}
ucword[j] = '\0';
}
return j;
}
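/* Return the offset of the first entry after p (the first position in
   (p,end] preceded by a 255 byte), or end if there is none */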
static int lex_data_next_entry(const cst_lexicon *l,int p,int end)
{
for (p++; p < end; p++)
if (l->data[p-1] == 255)
return p;
return end;
}
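/* Return the offset of the last entry before p, or start if there is none */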
static int lex_data_prev_entry(const cst_lexicon *l,int p,int start)
{
for (p--; p > start; p--)
if (l->data[p-1] == 255)
return p;
return start;
}
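/* Return the start of the entry closest to p, scanning outwards within
   (start,end) */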
static int lex_data_closest_entry(const cst_lexicon *l,int p,int start,int end)
{
int d;
d=0;
while ((p-d > start) &&
(p+d < end))
{
if (l->data[(p+d)-1] == 255)
{
return p+d;
}
else if (l->data[(p-d)-1] == 255)
{
return p-d;
}
d++;
}
return p-d;
}
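/* Binary search the compiled lexicon for word (with its pos prefix
   character); returns the byte offset of the best matching entry
   (preferring an exact pos match), or -1 if the word is not present */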
static int lex_lookup_bsearch(const cst_lexicon *l, const char *word)
{
int start,mid,end,c;
/* needs to be longer than longest word in lexicon */
char word_pos[WP_SIZE];
start = 0;
end = l->num_bytes;
while (start < end) {
mid = (start + end)/2;
/* find previous entry start */
mid = lex_data_closest_entry(l,mid,start,end);
lex_uncompress_word(word_pos,WP_SIZE,mid,l);
c = lex_match_entry(word_pos,word);
if (c == 0)
{
return find_full_match(l,mid,word);
}
else if (c > 0)
end = mid;
else
{
start = lex_data_next_entry(l,mid + 1,end);
}
#if 0
if (l->data[start-1] == 255)
{
lex_uncompress_word(word_pos,WP_SIZE,start,l);
printf("start %s %d ",word_pos,start);
}
else
printf("start NULL %d ",start);
if (l->data[mid-1] == 255)
{
lex_uncompress_word(word_pos,WP_SIZE,mid,l);
printf("mid %s %d ",word_pos,mid);
}
else
printf("mid NULL %d ",mid);
if (l->data[end-1] == 255)
{
lex_uncompress_word(word_pos,WP_SIZE,end,l);
printf("end %s %d ",word_pos,end);
}
else
printf("end NULL %d ",end);
printf("\n");
#endif
}
return -1;
}
static int find_full_match(const cst_lexicon *l,
int i,const char *word)
{
/* found word, now look for actual match including pos */
int w, match=i;
/* needs to be longer than longest word in lexicon */
char word_pos[WP_SIZE];
for (w=i; w > 0; )
{
lex_uncompress_word(word_pos,WP_SIZE,w,l);
if (!cst_streq(word+1,word_pos+1))
break;
else if (cst_streq(word,word_pos))
return w;
match = w; /* if we can't find an exact match we'll take this one */
/* go back to last entry */
w = lex_data_prev_entry(l,w,0);
}
for (w=i; w < l->num_bytes;)
{
lex_uncompress_word(word_pos,WP_SIZE,w,l);
if (!cst_streq(word+1,word_pos+1))
break;
else if (cst_streq(word,word_pos))
return w;
/* go to next entry */
w = lex_data_next_entry(l,w,l->num_bytes);
}
return match;
}
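/* Compare two entries ignoring their leading pos characters */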
static int lex_match_entry(const char *a, const char *b)
{
int c;
c = strcmp(a+1,b+1);
return c;
}