/* src/lexicon/cst_lexicon.c - third_party/flite - Git at Google */

 /*************************************************************************/
 /*                                                                       */
 /*                  Language Technologies Institute                      */
 /*                     Carnegie Mellon University                        */
 /*                        Copyright (c) 1999                             */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
 /*               Date:  December 1999                                    */
 /*************************************************************************/
 /*                                                                       */
 /*  Lexicon related functions                                            */
 /*                                                                       */
 /*************************************************************************/

 #include "cst_features.h"
 #include "cst_lexicon.h"
 #include "cst_tokenstream.h"

 /* NOTE(review): presumably registers "lexicon" as a cst_val type for   */
 /* cst_lexicon without a delete function -- confirm against the macro   */
 /* definition, which is not in this file                                */
 CST_VAL_REGISTER_TYPE_NODEL(lexicon,cst_lexicon)

 /* Size of the word_pos buffers that lex_lookup_bsearch and            */
 /* find_full_match uncompress lexicon entries into                     */
 #define WP_SIZE 64

 /* Forward declarations of file-local helpers defined further down     */
 static int no_syl_boundaries(const cst_item *i, const cst_val *p);
 static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
                                    int *found);

 static int lex_match_entry(const char *a, const char *b);
 static int lex_lookup_bsearch(const cst_lexicon *l,const char *word);
 static int find_full_match(const cst_lexicon *l,
 			   int i,const char *word);

 /* Allocate a new lexicon and return it.  Its syl_boundary function is */
 /* set to the default no_syl_boundaries; every other field is left as  */
 /* cst_alloc provides it.  Takes no arguments (declared with void so   */
 /* that the definition is a real prototype).                           */
 cst_lexicon *new_lexicon(void)
 {
     cst_lexicon *l = cst_alloc(cst_lexicon,1);

     l->syl_boundary = no_syl_boundaries;
     return l;
 }

 /* Free the data block of lex and then lex itself.  A null lex is      */
 /* ignored.                                                            */
 /* Lexicons are mapped, so this is unlikely to ever be called, and it  */
 /* probably isn't complete.                                            */
 void delete_lexicon(cst_lexicon *lex)
 {
     if (!lex)
         return;

     cst_free(lex->data);
     cst_free(lex);
 }

 /* Read addenda entries from lexfile, one entry per line, and return   */
 /* them as a list in file order.  Lines whose first character is '#'   */
 /* and lines holding nothing but spaces are skipped; every other line  */
 /* is handed to cst_lex_make_entry and lines it rejects are dropped.   */
 /* Returns NULL (after an error message) if the file cannot be opened. */
 cst_val *cst_lex_load_addenda(const cst_lexicon *lex, const char *lexfile)
 {
     cst_tokenstream *lf;
     const cst_string *line;
     cst_val *entry;
     cst_val *entries = NULL;
     int i;

     lf = ts_open(lexfile,"\n","","","");
     if (lf == NULL)
     {
         cst_errmsg("lex_add_addenda: cannot open lexicon file\n");
         return NULL;
     }

     while (!ts_eof(lf))
     {
         line = ts_get(lf);
         if (line[0] == '#')
             continue;  /* a comment */

         /* move to the first character that is not a space */
         i = 0;
         while (line[i] == ' ')
             i++;
         if (line[i] == '\0')
             continue;  /* a blank line */

         entry = cst_lex_make_entry(lex,line);
         if (entry != NULL)
             entries = cons_val(entry,entries);
     }

     ts_close(lf);

     /* entries were pushed on the front, so put them back in file order */
     return val_reverse(entries);
 }

 /* Parse one addenda line of the form                                  */
 /*     word [pos] : phone phone ...                                    */
 /* and return it as a new list (word pos phone ...).  The word may be  */
 /* written in double quotes; when no pos is given "nil" is used.  A    */
 /* "#" token or an empty token ends the phone list.  Phones that are   */
 /* not in lex->phone_table are reported on stdout and left out.        */
 /* Returns NULL, after a message on stdout, when ":" is missing.       */
 cst_val *cst_lex_make_entry(const cst_lexicon *lex, const cst_string *entry)
 {
     cst_tokenstream *ts;
     cst_val *phones = NULL;
     cst_val *ventry;
     const cst_string *tok;
     cst_string *word;
     cst_string *pos;
     int i;

     ts = ts_open_string(entry,
                         cst_ts_default_whitespacesymbols,
                         "","","");

     tok = ts_get(ts);
     if (tok[0] == '"')
     {   /* a quoted word: start over and read it as a quoted token */
         ts_close(ts);
         ts = ts_open_string(entry,
                             cst_ts_default_whitespacesymbols,
                             "","","");
         tok = ts_get_quoted_token(ts,'"','\\');
     }
     word = cst_strdup(tok);

     tok = ts_get(ts);
     if (cst_streq(":",tok))
         pos = cst_strdup("nil");  /* no pos was given */
     else
     {
         pos = cst_strdup(tok);
         tok = ts_get(ts);
         if (!cst_streq(":",tok)) /* the separator must follow the pos */
         {
             cst_fprintf(stdout,"add_addenda: lex %s: expected \":\" in %s\n",
                         lex->name,
                         word);
             cst_free(word);
             cst_free(pos);
             ts_close(ts);
             return NULL;
         }
     }

     while (!ts_eof(ts))
     {
         tok = ts_get(ts);
         if (cst_streq("#",tok))  /* comment to end of line */
             break;
         if (cst_streq("",tok))   /* trailing ws at eoln causes this */
             break;

         /* search the phone table: i stops on the matching phone or */
         /* on the terminating null entry                            */
         i = 0;
         while (lex->phone_table[i] && !cst_streq(tok,lex->phone_table[i]))
             i++;

         if (lex->phone_table[i])
             phones = cons_val(string_val(tok),phones);
         else
             cst_fprintf(stdout,"add_addenda: lex: %s word %s phone %s not in lexicon phoneset\n",
                         lex->name,
                         word,
                         tok);
     }

     /* phones were pushed on the front, so reverse into line order */
     ventry = cons_val(string_val(word),cons_val(string_val(pos),
                                                 val_reverse(phones)));
     cst_free(word);
     cst_free(pos);
     ts_close(ts);

     return ventry;
 }

 /* Compiled out (#if 0): a registry of known lexicons, kept in the     */
 /* flite_lexicons array of flite_num_lexicons entries                  */
 #if 0
 void lexicon_register(cst_lexicon *lex)
 {
     /* Add given lexicon to list of known lexicons */
     cst_lexicon **old_lexs;
     int i;

     /* build an array one slot longer, copy the old pointers across, */
     /* put lex in the last slot and free the old array               */
     old_lexs = flite_lexicons;
     flite_num_lexicons++;
     flite_lexicons = cst_alloc(cst_lexicon *,flite_num_lexicons);
     for (i=0; i<flite_num_lexicons-1; i++)
 	flite_lexicons[i] = old_lexs[i];
     flite_lexicons[i] = lex;
     cst_free(old_lexs);
 }

 cst_lexicon *lexicon_select(const char *name)
 {
     /* Return the registered lexicon whose name equals name, or NULL */
     int i;

     for (i=0; i < flite_num_lexicons; i++)
 	if (cst_streq(name,flite_lexicons[i]->name))
 	    return flite_lexicons[i];
     return NULL;
 }
 #endif

 /* Default syl_boundary function, installed by new_lexicon and         */
 /* normally replaced for each lexicon.  Both arguments are ignored     */
 /* and the result is always FALSE.                                     */
 static int no_syl_boundaries(const cst_item *i, const cst_val *p)
 {
     (void)p;
     (void)i;

     return FALSE;
 }

 /* Return TRUE if word is in lexicon l, either in its addenda or in    */
 /* its compiled entries, and FALSE otherwise.                          */
 /*   pos    part of speech; only its first character is used and a     */
 /*          null pos is treated as '0', which matches any addenda pos  */
 /*   feats  unused                                                     */
 /* An addenda entry whose own pos character is '0' matches any pos.    */
 /* This is the rule lex_lookup_addenda applies, so a word that         */
 /* lex_lookup resolves from the addenda is reported as present here.   */
 int in_lex(const cst_lexicon *l, const char *word, const char *pos,
            const cst_features *feats)
 {
     int r = FALSE, i;
     char *wp;
     char apos;

     (void)feats;

     /* the search key is the pos character followed by the word */
     wp = cst_alloc(char,cst_strlen(word)+2);
     cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);

     for (i=0; l->addenda && l->addenda[i]; i++)
     {
         apos = l->addenda[i][0][0];
         if (((wp[0] == '0') || (wp[0] == apos) || (apos == '0')) &&
             cst_streq(wp+1,l->addenda[i][0]+1))
         {
             r = TRUE;
             break;
         }
     }

     if (!r && (lex_lookup_bsearch(l,wp) >= 0))
         r = TRUE;

     cst_free(wp);
     return r;
 }

 /* Return the phones for word as a new list.                           */
 /* The addenda is searched first, then the compiled entries; if the    */
 /* word is in neither, the lexicon's lts_function, or failing that its */
 /* lts_rule_set, is applied to word.  The result is null when none of  */
 /* these yields anything.  Only the first character of pos is used     */
 /* and a null pos is treated as '0'.                                   */
 cst_val *lex_lookup(const cst_lexicon *l, const char *word, const char *pos,
                     const cst_features *feats)
 {
     char *wp;
     cst_val *phones = 0;
     int found = FALSE;
     int index;
     int p;
     const char *q;

     /* the search key is the pos character followed by the word */
     wp = cst_alloc(char,cst_strlen(word)+2);
     cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);

     if (l->addenda)
         phones = lex_lookup_addenda(wp,l,&found);

     if (found)
     {
         cst_free(wp);
         return phones;
     }

     index = lex_lookup_bsearch(l,wp);
     if (index >= 0)
     {
         /* the phone bytes are read backwards from index-2 down to a */
         /* zero byte                                                 */
         for (p=index-2; l->data[p]; p--)
         {
             if (l->phone_hufftable)
             {   /* each byte names a string of phone codes */
                 for (q=l->phone_hufftable[l->data[p]]; *q; q++)
                     phones = cons_val(string_val(l->phone_table[(unsigned char)*q]),
                                       phones);
             }
             else  /* no compression -- should we still support this ? */
                 phones = cons_val(string_val(l->phone_table[l->data[p]]),
                                   phones);
         }
         phones = val_reverse(phones);
     }
     else if (l->lts_function)
         phones = (l->lts_function)(l,word,"",feats);
     else if (l->lts_rule_set)
         phones = lts_apply(word,
                            "",  /* more features if we had them */
                            l->lts_rule_set);

     cst_free(wp);

     return phones;
 }

 /* Look wp (a pos character followed by the word) up in l->addenda,    */
 /* which must not be null.  On the first matching entry *found is set  */
 /* to TRUE and the entry's phones are returned as a new list;          */
 /* otherwise NULL is returned and *found is not touched.               */
 /* Two pos characters match when they are equal or either is '0'.      */
 static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
                                    int *found)
 {
     int i,j;
     cst_val *phones = NULL;

     for (i=0; l->addenda[i]; i++)
     {
         if (!cst_streq(wp+1,l->addenda[i][0]+1))
             continue;  /* a different word */
         if ((wp[0] != '0') &&
             (wp[0] != l->addenda[i][0][0]) &&
             (l->addenda[i][0][0] != '0'))
             continue;  /* same word, but the pos does not match */

         /* strings 1.. of the entry are its phones */
         for (j=1; l->addenda[i][j]; j++)
             phones = cons_val(string_val(l->addenda[i][j]),phones);
         *found = TRUE;
         return val_reverse(phones);
     }

     return NULL;
 }

 /* Expand the entry that starts at l->data[p] into ucword as a         */
 /* NUL-terminated string and return the length written.                */
 /*   ucword    destination buffer                                      */
 /*   max_size  size of ucword in bytes; output that does not fit is    */
 /*             truncated, and nothing is written if it is not positive */
 /* With no entry_hufftable the bytes are copied as they are; otherwise */
 /* every byte is replaced by the string it indexes in that table.      */
 static int lex_uncompress_word(char *ucword,int max_size,
                                int p,const cst_lexicon *l)
 {
     const unsigned char *cword = &l->data[p];
     int i,j=0,length;

     if (max_size <= 0)
         return 0;  /* no room even for the terminator */

     if (l->entry_hufftable == 0)
     {
         /* can have "compressed" lexicons without compression;      */
         /* the copy is bounded so a long entry cannot overrun ucword */
         for (j=0; cword[j] && (j+1 < max_size); j++)
             ucword[j] = (char)cword[j];
     }
     else
     {
         for (i=0; cword[i]; i++)
         {
             length = cst_strlen(l->entry_hufftable[cword[i]]);
             if (j+length+1 >= max_size)
                 break;  /* the next piece would not fit: truncate */
             memmove(ucword+j,l->entry_hufftable[cword[i]],length);
             j += length;
         }
     }
     ucword[j] = '\0';

     return j;
 }
 /* Scan forward from p: return the smallest position q, p < q < end,   */
 /* whose preceding byte in l->data is 255, or end if there is none.    */
 static int lex_data_next_entry(const cst_lexicon *l,int p,int end)
 {
     int q = p + 1;

     while (q < end)
     {
         if (l->data[q-1] == 255)
             return q;
         q++;
     }
     return end;
 }
 /* Scan backward from p: return the largest position q, start < q < p, */
 /* whose preceding byte in l->data is 255, or start if there is none.  */
 static int lex_data_prev_entry(const cst_lexicon *l,int p,int start)
 {
     int q = p - 1;

     while (q > start)
     {
         if (l->data[q-1] == 255)
             return q;
         q--;
     }
     return start;
 }
 /* Search outwards from p, one step at a time in both directions, for  */
 /* a position whose preceding byte in l->data is 255.  At equal        */
 /* distance the position above p is preferred.  The search stops when  */
 /* the lower side reaches start or the upper side reaches end, and     */
 /* the lower position reached is returned in that case.                */
 static int lex_data_closest_entry(const cst_lexicon *l,int p,int start,int end)
 {
     int lo = p;
     int hi = p;

     for ( ; (lo > start) && (hi < end); lo--, hi++)
     {
         if (l->data[hi-1] == 255)
             return hi;
         if (l->data[lo-1] == 255)
             return lo;
     }
     return lo;
 }

 /* Binary search the entries in l->data for word (a pos character      */
 /* followed by the word).  The pos character is not part of the        */
 /* comparison.  When an entry with the same word is found the position */
 /* chosen by find_full_match is returned; otherwise -1.                */
 static int lex_lookup_bsearch(const cst_lexicon *l, const char *word)
 {
     int start = 0;
     int end = l->num_bytes;
     int mid, c;
     /* needs to be longer than longest word in lexicon */
     char word_pos[WP_SIZE];

     while (start < end)
     {
         /* probe the entry position closest to the midpoint */
         mid = lex_data_closest_entry(l,(start + end)/2,start,end);
         lex_uncompress_word(word_pos,WP_SIZE,mid,l);

         c = lex_match_entry(word_pos,word);
         if (c > 0)
             end = mid;
         else if (c < 0)
             start = lex_data_next_entry(l,mid + 1,end);
         else
             return find_full_match(l,mid,word);
     }
     return -1;
 }

 /* i is the position of an entry whose word part equals that of word.  */
 /* Look through the neighbouring entries with the same word part,      */
 /* first backwards and then forwards, for one whose pos character      */
 /* matches as well, and return its position.  If there is none, the    */
 /* last same-word entry visited by the backward scan is returned.      */
 static int find_full_match(const cst_lexicon *l,
                            int i,const char *word)
 {
     int w, match=i;
     /* needs to be longer than longest word in lexicon */
     char word_pos[WP_SIZE];

     w = i;
     while (w > 0)
     {
         lex_uncompress_word(word_pos,WP_SIZE,w,l);
         if (!cst_streq(word+1,word_pos+1))
             break;
         if (cst_streq(word,word_pos))
             return w;
         match = w;  /* if we can't find an exact match we'll take this one */
         w = lex_data_prev_entry(l,w,0);  /* go back to last entry */
     }

     w = i;
     while (w < l->num_bytes)
     {
         lex_uncompress_word(word_pos,WP_SIZE,w,l);
         if (!cst_streq(word+1,word_pos+1))
             break;
         if (cst_streq(word,word_pos))
             return w;
         w = lex_data_next_entry(l,w,l->num_bytes);  /* go to next entry */
     }

     return match;
 }

 /* Compare two entries while skipping the first (pos) character of     */
 /* each.  The result is what strcmp gives for the remaining strings.   */
 static int lex_match_entry(const char *a, const char *b)
 {
     return strcmp(a+1,b+1);
 }
	/*************************************************************************/
	/* */
	/* Language Technologies Institute */
	/* Carnegie Mellon University */
	/* Copyright (c) 1999 */
	/* All Rights Reserved. */
	/* */
	/* Permission is hereby granted, free of charge, to use and distribute */
	/* this software and its documentation without restriction, including */
	/* without limitation the rights to use, copy, modify, merge, publish, */
	/* distribute, sublicense, and/or sell copies of this work, and to */
	/* permit persons to whom this work is furnished to do so, subject to */
	/* the following conditions: */
	/* 1. The code must retain the above copyright notice, this list of */
	/* conditions and the following disclaimer. */
	/* 2. Any modifications must be clearly marked as such. */
	/* 3. Original authors' names are not deleted. */
	/* 4. The authors' names are not used to endorse or promote products */
	/* derived from this software without specific prior written */
	/* permission. */
	/* */
	/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
	/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
	/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
	/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
	/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
	/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
	/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
	/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
	/* THIS SOFTWARE. */
	/* */
	/*************************************************************************/
	/* Author: Alan W Black (awb@cs.cmu.edu) */
	/* Date: December 1999 */
	/*************************************************************************/
	/* */
	/* Lexicon related functions */
	/* */
	/*************************************************************************/

	#include "cst_features.h"
	#include "cst_lexicon.h"
	#include "cst_tokenstream.h"

	CST_VAL_REGISTER_TYPE_NODEL(lexicon,cst_lexicon)

	/* Size of the word_pos buffers that lex_lookup_bsearch and            */
	/* find_full_match uncompress lexicon entries into                     */
	#define WP_SIZE 64

	/* Forward declarations of file-local helpers defined further down.    */
	/* The pointer declarators match the definitions' use of the arguments */
	static int no_syl_boundaries(const cst_item *i, const cst_val *p);
	static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
	                                   int *found);

	static int lex_match_entry(const char *a, const char *b);
	static int lex_lookup_bsearch(const cst_lexicon *l,const char *word);
	static int find_full_match(const cst_lexicon *l,
	                           int i,const char *word);

	/* Allocate a new lexicon and return it.  Its syl_boundary function is */
	/* set to the default no_syl_boundaries; every other field is left as  */
	/* cst_alloc provides it.  Takes no arguments (declared with void so   */
	/* that the definition is a real prototype).                           */
	cst_lexicon *new_lexicon(void)
	{
	    cst_lexicon *l = cst_alloc(cst_lexicon,1);

	    l->syl_boundary = no_syl_boundaries;
	    return l;
	}

	/* Free the data block of lex and then lex itself.  A null lex is      */
	/* ignored.                                                            */
	/* Lexicons are mapped, so this is unlikely to ever be called, and it  */
	/* probably isn't complete.                                            */
	void delete_lexicon(cst_lexicon *lex)
	{
	    if (!lex)
	        return;

	    cst_free(lex->data);
	    cst_free(lex);
	}

	/* Read addenda entries from lexfile, one entry per line, and return   */
	/* them as a list in file order.  Lines whose first character is '#'   */
	/* and lines holding nothing but spaces are skipped; every other line  */
	/* is handed to cst_lex_make_entry and lines it rejects are dropped.   */
	/* Returns NULL (after an error message) if the file cannot be opened. */
	cst_val *cst_lex_load_addenda(const cst_lexicon *lex, const char *lexfile)
	{
	    cst_tokenstream *lf;
	    const cst_string *line;
	    cst_val *entry;
	    cst_val *entries = NULL;
	    int i;

	    lf = ts_open(lexfile,"\n","","","");
	    if (lf == NULL)
	    {
	        cst_errmsg("lex_add_addenda: cannot open lexicon file\n");
	        return NULL;
	    }

	    while (!ts_eof(lf))
	    {
	        line = ts_get(lf);
	        if (line[0] == '#')
	            continue;  /* a comment */

	        /* move to the first character that is not a space */
	        i = 0;
	        while (line[i] == ' ')
	            i++;
	        if (line[i] == '\0')
	            continue;  /* a blank line */

	        entry = cst_lex_make_entry(lex,line);
	        if (entry != NULL)
	            entries = cons_val(entry,entries);
	    }

	    ts_close(lf);

	    /* entries were pushed on the front, so put them back in file order */
	    return val_reverse(entries);
	}

	cst_val cst_lex_make_entry(const cst_lexicon lex, const cst_string *entry)
	{ /* if replace then replace entry in addenda of lex with entry */
	/* else append entry to addenda of lex */
	cst_tokenstream *e;
	cst_val *phones = NULL;
	cst_val *ventry;
	const cst_string w, p;
	cst_string *word;
	cst_string *pos;
	int i;

	e = ts_open_string(entry,
	cst_ts_default_whitespacesymbols,
	"","","");

	w = ts_get(e);
	if (w[0] == '"') /* it was a quoted entry */
	{ /* so reparse it */
	ts_close(e);
	e = ts_open_string(entry,
	cst_ts_default_whitespacesymbols,
	"","","");
	w = ts_get_quoted_token(e,'"','\\');
	}

	word = cst_strdup(w);
	p = ts_get(e);
	if (!cst_streq(":",p)) /* there is a real pos */
	{
	pos = cst_strdup(p);
	p = ts_get(e);
	if (!cst_streq(":",p)) /* there is a real pos */
	{
	cst_fprintf(stdout,"add_addenda: lex %s: expected \":\" in %s\n",
	lex->name,
	word);
	cst_free(word); cst_free(pos); ts_close(e);
	return NULL;
	}
	}
	else
	pos = cst_strdup("nil");

	while (!ts_eof(e))
	{
	p = ts_get(e);
	/* Check its a legal phone */
	for (i=0; lex->phone_table[i]; i++)
	{
	if (cst_streq(p,lex->phone_table[i]))
	break;
	}
	if (cst_streq("#",p)) /* comment to end of line */
	break;
	else if (cst_streq("",p)) /* trailing ws at eoln causes this */
	break;
	else if (lex->phone_table[i])
	/* Only add it if its a valid phone */
	phones = cons_val(string_val(p),phones);
	else
	{
	cst_fprintf(stdout,"add_addenda: lex: %s word %s phone %s not in lexicon phoneset\n",
	lex->name,
	word,
	p);
	}
	}

	ventry = cons_val(string_val(word),cons_val(string_val(pos),
	val_reverse(phones)));
	cst_free(word); cst_free(pos); ts_close(e);
	#if 0
	printf("entry: ");
	val_print(stdout,ventry);
	printf("\n");
	#endif

	return ventry;
	}

	/* Compiled out (#if 0): a registry of known lexicons, kept in the     */
	/* flite_lexicons array of flite_num_lexicons entries                  */
	#if 0
	void lexicon_register(cst_lexicon *lex)
	{
	    /* Add given lexicon to list of known lexicons */
	    cst_lexicon **old_lexs;
	    int i;

	    /* build an array one slot longer, copy the old pointers across, */
	    /* put lex in the last slot and free the old array               */
	    old_lexs = flite_lexicons;
	    flite_num_lexicons++;
	    flite_lexicons = cst_alloc(cst_lexicon *,flite_num_lexicons);
	    for (i=0; i<flite_num_lexicons-1; i++)
	        flite_lexicons[i] = old_lexs[i];
	    flite_lexicons[i] = lex;
	    cst_free(old_lexs);
	}

	cst_lexicon *lexicon_select(const char *name)
	{
	    /* Return the registered lexicon whose name equals name, or NULL */
	    int i;

	    for (i=0; i < flite_num_lexicons; i++)
	        if (cst_streq(name,flite_lexicons[i]->name))
	            return flite_lexicons[i];
	    return NULL;
	}
	#endif

	/* Default syl_boundary function, installed by new_lexicon and         */
	/* normally replaced for each lexicon.  Both arguments are ignored     */
	/* and the result is always FALSE.                                     */
	static int no_syl_boundaries(const cst_item *i, const cst_val *p)
	{
	    (void)i;
	    (void)p;

	    return FALSE;
	}

	int in_lex(const cst_lexicon l, const char word, const char *pos,
	const cst_features *feats)
	{
	/* return TRUE is its in the lexicon */
	int r = FALSE, i;
	char *wp;

	wp = cst_alloc(char,cst_strlen(word)+2);
	cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);

	for (i=0; l->addenda && l->addenda[i]; i++)
	{
	if (((wp[0] == '0') \|\| (wp[0] == l->addenda[i][0][0])) &&
	(cst_streq(wp+1,l->addenda[i][0]+1)))
	{
	r = TRUE;
	break;
	}
	}

	if (!r && (lex_lookup_bsearch(l,wp) >= 0))
	r = TRUE;

	cst_free(wp);
	return r;
	}

	cst_val lex_lookup(const cst_lexicon l, const char word, const char pos,
	const cst_features *feats)
	{
	int index;
	int p;
	const char *q;
	char *wp;
	cst_val *phones = 0;
	int found = FALSE;

	wp = cst_alloc(char,cst_strlen(word)+2);
	cst_sprintf(wp,"%c%s",(pos ? pos[0] : '0'),word);

	if (l->addenda)
	phones = lex_lookup_addenda(wp,l,&found);

	if (!found)
	{
	index = lex_lookup_bsearch(l,wp);

	if (index >= 0)
	{
	if (l->phone_hufftable)
	{
	for (p=index-2; l->data[p]; p--)
	for (q=l->phone_hufftable[l->data[p]]; *q; q++)
	phones = cons_val(string_val(l->phone_table[(unsigned char)*q]),
	phones);
	}
	else /* no compression -- should we still support this ? */
	{
	for (p=index-2; l->data[p]; p--)
	phones = cons_val(string_val(l->phone_table[l->data[p]]),
	phones);
	}
	phones = val_reverse(phones);
	}
	else if (l->lts_function)
	{
	phones = (l->lts_function)(l,word,"",feats);
	}
	else if (l->lts_rule_set)
	{
	phones = lts_apply(word,
	"", /* more features if we had them */
	l->lts_rule_set);
	}
	}

	cst_free(wp);

	return phones;
	}

	/* Look wp (a pos character followed by the word) up in l->addenda,    */
	/* which must not be null.  On the first matching entry *found is set  */
	/* to TRUE and the entry's phones are returned as a new list;          */
	/* otherwise NULL is returned and *found is not touched.               */
	/* Two pos characters match when they are equal or either is '0'.      */
	static cst_val *lex_lookup_addenda(const char *wp,const cst_lexicon *l,
	                                   int *found)
	{
	    int i,j;
	    cst_val *phones = NULL;

	    for (i=0; l->addenda[i]; i++)
	    {
	        if (((wp[0] == '0') ||
	             (wp[0] == l->addenda[i][0][0]) ||
	             (l->addenda[i][0][0] == '0')) &&
	            (cst_streq(wp+1,l->addenda[i][0]+1)))
	        {
	            /* strings 1.. of the entry are its phones */
	            for (j=1; l->addenda[i][j]; j++)
	                phones = cons_val(string_val(l->addenda[i][j]),phones);
	            *found = TRUE;
	            return val_reverse(phones);
	        }
	    }

	    return NULL;
	}

	/* Expand the entry that starts at l->data[p] into ucword as a         */
	/* NUL-terminated string and return the length written.                */
	/*   ucword    destination buffer                                      */
	/*   max_size  size of ucword in bytes; output that does not fit is    */
	/*             truncated, and nothing is written if it is not positive */
	/* With no entry_hufftable the bytes are copied as they are; otherwise */
	/* every byte is replaced by the string it indexes in that table.      */
	static int lex_uncompress_word(char *ucword,int max_size,
	                               int p,const cst_lexicon *l)
	{
	    const unsigned char *cword = &l->data[p];
	    int i,j=0,length;

	    if (max_size <= 0)
	        return 0;  /* no room even for the terminator */

	    if (l->entry_hufftable == 0)
	    {
	        /* can have "compressed" lexicons without compression;      */
	        /* the copy is bounded so a long entry cannot overrun ucword */
	        for (j=0; cword[j] && (j+1 < max_size); j++)
	            ucword[j] = (char)cword[j];
	    }
	    else
	    {
	        for (i=0; cword[i]; i++)
	        {
	            length = cst_strlen(l->entry_hufftable[cword[i]]);
	            if (j+length+1 >= max_size)
	                break;  /* the next piece would not fit: truncate */
	            memmove(ucword+j,l->entry_hufftable[cword[i]],length);
	            j += length;
	        }
	    }
	    ucword[j] = '\0';

	    return j;
	}
	/* Scan forward from p: return the smallest position q, p < q < end,   */
	/* whose preceding byte in l->data is 255, or end if there is none.    */
	static int lex_data_next_entry(const cst_lexicon *l,int p,int end)
	{
	    int q = p + 1;

	    while (q < end)
	    {
	        if (l->data[q-1] == 255)
	            return q;
	        q++;
	    }
	    return end;
	}
	/* Scan backward from p: return the largest position q, start < q < p, */
	/* whose preceding byte in l->data is 255, or start if there is none.  */
	static int lex_data_prev_entry(const cst_lexicon *l,int p,int start)
	{
	    int q = p - 1;

	    while (q > start)
	    {
	        if (l->data[q-1] == 255)
	            return q;
	        q--;
	    }
	    return start;
	}
	/* Search outwards from p, one step at a time in both directions, for  */
	/* a position whose preceding byte in l->data is 255.  At equal        */
	/* distance the position above p is preferred.  The search stops when  */
	/* the lower side reaches start or the upper side reaches end, and     */
	/* the lower position reached is returned in that case.                */
	static int lex_data_closest_entry(const cst_lexicon *l,int p,int start,int end)
	{
	    int lo = p;
	    int hi = p;

	    for ( ; (lo > start) && (hi < end); lo--, hi++)
	    {
	        if (l->data[hi-1] == 255)
	            return hi;
	        if (l->data[lo-1] == 255)
	            return lo;
	    }
	    return lo;
	}

	static int lex_lookup_bsearch(const cst_lexicon l, const char word)
	{
	int start,mid,end,c;
	/* needs to be longer than longest word in lexicon */
	char word_pos[WP_SIZE];

	start = 0;
	end = l->num_bytes;
	while (start < end) {
	mid = (start + end)/2;

	/* find previous entry start */
	mid = lex_data_closest_entry(l,mid,start,end);
	lex_uncompress_word(word_pos,WP_SIZE,mid,l);

	c = lex_match_entry(word_pos,word);

	if (c == 0)
	{
	return find_full_match(l,mid,word);
	}
	else if (c > 0)
	end = mid;
	else
	{
	start = lex_data_next_entry(l,mid + 1,end);
	}

	#if 0
	if (l->data[start-1] == 255)
	{
	lex_uncompress_word(word_pos,WP_SIZE,start,l);
	printf("start %s %d ",word_pos,start);
	}
	else
	printf("start NULL %d ",start);
	if (l->data[mid-1] == 255)
	{
	lex_uncompress_word(word_pos,WP_SIZE,mid,l);
	printf("mid %s %d ",word_pos,mid);
	}
	else
	printf("mid NULL %d ",mid);
	if (l->data[end-1] == 255)
	{
	lex_uncompress_word(word_pos,WP_SIZE,end,l);
	printf("end %s %d ",word_pos,end);
	}
	else
	printf("end NULL %d ",end);
	printf("\n");
	#endif

	}
	return -1;
	}

	/* i is the position of an entry whose word part equals that of word.  */
	/* Look through the neighbouring entries with the same word part,      */
	/* first backwards and then forwards, for one whose pos character      */
	/* matches as well, and return its position.  If there is none, the    */
	/* last same-word entry visited by the backward scan is returned.      */
	static int find_full_match(const cst_lexicon *l,
	                           int i,const char *word)
	{
	    int w, match=i;
	    /* needs to be longer than longest word in lexicon */
	    char word_pos[WP_SIZE];

	    w = i;
	    while (w > 0)
	    {
	        lex_uncompress_word(word_pos,WP_SIZE,w,l);
	        if (!cst_streq(word+1,word_pos+1))
	            break;
	        if (cst_streq(word,word_pos))
	            return w;
	        match = w;  /* if we can't find an exact match we'll take this one */
	        w = lex_data_prev_entry(l,w,0);  /* go back to last entry */
	    }

	    w = i;
	    while (w < l->num_bytes)
	    {
	        lex_uncompress_word(word_pos,WP_SIZE,w,l);
	        if (!cst_streq(word+1,word_pos+1))
	            break;
	        if (cst_streq(word,word_pos))
	            return w;
	        w = lex_data_next_entry(l,w,l->num_bytes);  /* go to next entry */
	    }

	    return match;
	}

	/* Compare two entries while skipping the first (pos) character of     */
	/* each.  The result is what strcmp gives for the remaining strings.   */
	/* Both arguments are strings of at least one character.               */
	static int lex_match_entry(const char *a, const char *b)
	{
	    return strcmp(a+1,b+1);
	}