lang/usenglish/us_text.c - third_party/flite - Git at Google

 /*************************************************************************/
 /*                                                                       */
 /*                  Language Technologies Institute                      */
 /*                     Carnegie Mellon University                        */
 /*                         Copyright (c) 2001                            */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
 /*               Date:  January 2001                                     */
 /*************************************************************************/
 /*                                                                       */
 /*  US English text analysis functions                                   */
 /*                                                                       */
 /*************************************************************************/

 #include <ctype.h>
 #include "flite.h"
 #include "usenglish.h"
 #include "us_text.h"
 #include "cst_regex.h"

 /* Forward declarations for helpers defined after us_tokentowords_one(). */
 /* text_splitable: should the token text s be split after position i? */
 static int text_splitable(const char *s,int i);
 /* state_name: word list for a US state abbreviation, or 0 if none applies */
 static cst_val *state_name(const char *name,cst_item *t);

 /* compiled us regexes */
 #include "us_regexes.h"

 /* Weights-and-measures abbreviations: { abbreviation, spoken expansion }. */
 /* Matching is exact (case sensitive) on the text that follows the last    */
 /* digit of a token; the list is terminated by a { NULL, NULL } entry.     */
 /* Note you need to also update the wandm regex in make_us_regexes too */
 static const char * const wandm_abbrevs[99][2] =
 {
     { "LB", "pounds" },
     { "LBS", "pounds" },
     { "lb", "pounds" },
     { "lbs", "pounds" },
     { "ft", "feet" },
     { "FT", "feet" },
     { "kg", "kilograms" },
     { "km", "kilometers" },
     { "cm", "centimeters" },
     { "mm", "millimeters" },
     { "ml", "milliliters" },
     { "oz", "ounces" },
     { "hz", "hertz" },
     { "Hz", "hertz" },
     { "HZ", "hertz" },
     { "KHz", "kilohertz" },
     { "MHz", "megahertz" },
     { "GHz", "gigahertz" },
     { "KB", "kilobytes" },
     { "GB", "gigabytes" },
     { "MB", "megabytes" },
     { "TB", "terabytes" },
     { NULL, NULL },
 };

 /* Words which, when they are the previous token's name, make "read" and  */
 /* "lead" expand to "reed"/"leed" rather than "red"/"led".  Compared      */
 /* exactly (no case folding); NULL terminated.                            */
 static const char * const eedwords[] = {
     "to",
     "can",
     "can't",
     "cannot",
     "cant",
     "could",
     "couldn't",
     "couldnt",
     "will",
     "shall",
     NULL};

 static int rex_like(const cst_item *t)
 {
     /* returns 1 if this is in a king like context */
     char *pn = cst_downcase(ffeature_string(t,"p.name"));
     char *ppn = cst_downcase(ffeature_string(t,"p.p.name"));
     int v = 0;

     if (cst_streq(pn,"louis") ||
 	cst_streq(pn,"henry") ||
 	cst_streq(pn,"charles") ||
 	cst_streq(pn,"philip") ||
 	cst_streq(pn,"george") ||
 	cst_streq(pn,"edward") ||
 	cst_streq(pn,"pius") ||
 	cst_streq(pn,"william") ||
 	cst_streq(pn,"richard") ||
 	cst_streq(pn,"ptolemy") ||
 	cst_streq(pn,"john") ||
 	cst_streq(pn,"paul") ||
 	cst_streq(pn,"peter") ||
 	cst_streq(pn,"nicholas") ||
 	cst_streq(pn,"frederick") ||
 	cst_streq(pn,"james") ||
 	cst_streq(pn,"alfonso") ||
 	cst_streq(pn,"ivan") ||
 	cst_streq(pn,"napolean") ||
 	cst_streq(pn,"leo") ||
 	cst_streq(pn,"gregory") ||
 	cst_streq(pn,"catherine") ||
 	cst_streq(pn,"alexandria") ||
 	cst_streq(pn,"pierre") ||
 	cst_streq(pn,"elizabeth") ||
 	cst_streq(pn,"mary"))
 	v = 1;
     else if (cst_streq(ppn,"king") ||
 	     cst_streq(ppn,"queen") ||
 	     cst_streq(ppn,"pope") ||
 	     cst_streq(ppn,"duke") ||
 	     cst_streq(ppn,"tsar") ||
 	     cst_streq(ppn,"emperor") ||
 	     cst_streq(ppn,"shah") ||
 	     cst_streq(ppn,"ceasar") ||
 	     cst_streq(ppn,"duchess") ||
 	     cst_streq(ppn,"tsarina") ||
 	     cst_streq(ppn,"empress") ||
 	     cst_streq(ppn,"baron") ||
 	     cst_streq(ppn,"baroness") ||
 	     cst_streq(ppn,"sultan") ||
 	     cst_streq(ppn,"count") ||
 	     cst_streq(ppn,"countess"))
 	v = 1;

     cst_free(pn);
     cst_free(ppn);
     return v;
 }

 static int section_like(const cst_item *t)
 {
     /* returns 1 if this is in a king like context */
     char *pn = cst_downcase(ffeature_string(t,"p.name"));
     int v = 0;

     if (cst_streq(pn,"section") ||
 	cst_streq(pn,"chapter") ||
 	cst_streq(pn,"part") ||
 	cst_streq(pn,"phrase") ||
 	cst_streq(pn,"verse") ||
 	cst_streq(pn,"scene") ||
 	cst_streq(pn,"act") ||
 	cst_streq(pn,"book") ||
 	cst_streq(pn,"volume") ||
 	cst_streq(pn,"chap") ||
 	cst_streq(pn,"war") ||
 	cst_streq(pn,"apollo") ||
 	cst_streq(pn,"trek") ||
 	cst_streq(pn,"fortran"))
 	v = 1;

     cst_free(pn);

     return v;
 }

 cst_utterance *us_textanalysis(cst_utterance *u)
 {
     /* Run default_textanalysis() on u, first installing                */
     /* us_tokentowords as the "tokentowords_func" feature unless the    */
     /* utterance already carries one.                                   */
     if (feat_present(u->features, "tokentowords_func"))
         return default_textanalysis(u);

     utt_set_feat(u, "tokentowords_func", itemfunc_val(us_tokentowords));
     return default_textanalysis(u);
 }

 static cst_val *us_tokentowords_one(cst_item *token, const char *name);

 /* Expand a token into its list of words, starting from the token's */
 /* own "name" feature.                                              */
 cst_val *us_tokentowords(cst_item *token)
 {
     const char *token_text = item_feat_string(token, "name");

     return us_tokentowords_one(token, token_text);
 }

 static cst_val *add_break(cst_val *l)
 {
     /* add feature (break 1) to last item in this list                  */
     /* The car of the last cell is replaced by a pair of the original   */
     /* car and a new feature set holding break=1.  Returns l itself.    */
     const cst_val *last;
     cst_val *tagged;
     cst_features *f;

     /* Handle the empty list up front, before val_cdr() is ever applied */
     /* to it: there is no last item to tag.                             */
     if (l == NULL)
         return l;

     /* walk to the final cell, i.e. the one with no cdr */
     for (last=l; val_cdr(last); last=val_cdr(last));

     f = new_features();
     feat_set_string(f,"break","1");
     tagged = cons_val(val_car(last),features_val(f));
     set_car((cst_val *)last,tagged);

     return l;
 }

 static int contains_unicode_single_quote(const char *name)
 {
     /* TRUE if the three byte sequence of the unicode single quote */
     /* occurs anywhere in name, FALSE otherwise                    */
     static const char *quote_bytes = "’";
     const char *cur;

     for (cur=name; *cur != '\0'; cur++)
     {
         /* Each following byte is only inspected once the one before it */
         /* matched a (non-zero) quote byte, so we never read beyond the */
         /* terminating NUL of name                                      */
         if (cur[0] != quote_bytes[0])
             continue;
         if (cur[1] != quote_bytes[1])
             continue;
         if (cur[2] == quote_bytes[2])
             return TRUE;
     }
     return FALSE;
 }

 static char *map_unicode_single_quote(const char *name)
 {
     /* Return a newly allocated copy of name in which every three byte */
     /* unicode single quote is replaced by an ascii single quote.      */
     /* The caller frees the result.                                    */
     static const char *quote_bytes = "’";
     char *mapped = cst_strdup(name);  /* result is never longer than name */
     const char *src = name;
     char *dst = mapped;

     while (*src)
     {
         if ((src[0] == quote_bytes[0]) &&
             (src[1] == quote_bytes[1]) &&
             (src[2] == quote_bytes[2]))
         {
             /* three input bytes collapse into one output byte */
             *dst++ = '\'';
             src += 3;
         }
         else
             *dst++ = *src++;
     }
     *dst = '\0';

     return mapped;
 }

 static cst_val *us_tokentowords_one(cst_item *token, const char *name)
 {
     /* Return list of words that expand token/name */
     /* name is the text to expand: the token's own name on the first   */
     /* call, or a piece of it when this function calls itself.  The    */
     /* cases below are tried strictly in order and the first one that  */
     /* matches decides the expansion, so their order matters.  The     */
     /* result may be NULL (no words).  Some cases also modify features */
     /* on token ("punc", "nsw", and temporarily "name").               */
     char *p, *aaa, *bbb, *ccc;
     int i,j,k,l;
     cst_val *r, *s, *ss;
     const cst_val *rr;
     const char *nsw = "";
     const char *ssml_alias = "";
     const char *token_name = "";
     cst_lexicon *lex;
     cst_utterance *utt;
     /* printf("token_name %s name %s\n",item_name(token),name); */
     /* FIXME: For SAPI and friends, any tokens with explicit
        pronunciations need to be passed through as-is.  This should be
        done in the interface code rather than here once the
        tokentowords hook is accessible. AWB: no, they should set the
        nsw feature and this function should deal with it (doesn't yet though)*/
     if (item_feat_present(token,"phones"))
 	return cons_val(string_val(name),NULL);

     if (item_feat_present(token,"nsw"))
 	nsw = item_feat_string(token,"nsw");

     token_name = item_name(token);

     if ((item_feat_present(token,"ssml_alias")) &&
         (!cst_streq(token_name,name)))  /* and we are not recursing */
     {
         /* SSML has given a substitute for this (and more) tokens */
         /* NOTE: the alias is not put through text normalization */
         ssml_alias = item_feat_string(token,"ssml_alias");
         if (cst_streq(ssml_alias,ffeature_string(token,"p.ssml_alias")))
             /* The first token gets the substitution */
             return NULL;
         else
         {
             return cons_val(string_val(ssml_alias),NULL);
         }
     }

     utt = item_utt(token);
     lex = val_lexicon(feat_val(utt->features,"lexicon"));

     /* tokens flagged ssml_comment=1 produce no words */
     if (cst_streq("1",get_param_string(item_feats(token),"ssml_comment","0")))
         r = NULL;
     else if ((cst_streq("a",name) || cst_streq("A",name)) &&
         ((item_next(token) == 0) ||
          (!cst_streq(name,item_name(token))) ||
          (!cst_streq("",ffeature_string(token,"punc")))))
     {   /* if A is a sub part of a token, then its ey not ah */
 	r = cons_val(string_val("_a"),0);
     }
     else if (cst_strlen(name) == 0)
         r = NULL;
     else if (cst_regex_match(dottedabbrevs,name))
     {   /* X.X.X */
 	/* drop the dots in place, then spell out the letters */
 	aaa = cst_strdup(name);
 	for (i=j=0; aaa[i]; i++)
 	    if (aaa[i] != '.')
 	    {
 		aaa[j] = aaa[i];
 		j++;
 	    }
 	aaa[j] = '\0';
 	r = en_exp_letters(aaa);
 	cst_free(aaa);
     }
     else if (cst_regex_match(cst_rx_commaint,name))
     {   /* 99,999,999 */
 	/* copy without the commas and expand as a real number */
 	aaa = cst_strdup(name);
 	for (j=i=0; i < cst_strlen(name); i++)
 	    if (name[i] != ',')
 	    {
 		aaa[j] = name[i];
 		j++;
 	    }
 	aaa[j] = '\0';
 	r = en_exp_real(aaa);
 	cst_free(aaa);
     }
     else if (cst_regex_match(sevenphonenumber,name))
     {   /* 234-3434 telephone numbers */
 	/* aaa is the part before the dash, bbb the part after; both are */
 	/* read as digits, with a break after the first part             */
 	p=strchr(name,'-');
 	aaa = cst_strdup(name);
 	aaa[cst_strlen(name)-cst_strlen(p)] = '\0';
 	bbb = cst_strdup(p+1);
 	r = val_append(add_break(en_exp_digits(aaa)),
 		       en_exp_digits(bbb));
 	cst_free(aaa);
 	cst_free(bbb);
     }
     else if
      ((cst_regex_match(threedigits,name) &&
       ((!cst_regex_match(cst_rx_digits,ffeature_string(token,"p.name"))
 	&& cst_regex_match(threedigits,ffeature_string(token,"n.name"))
 	&& cst_regex_match(fourdigits,ffeature_string(token,"n.n.name"))) ||
        (cst_regex_match(sevenphonenumber,ffeature_string(token,"n.name"))) ||
        (!cst_regex_match(cst_rx_digits,ffeature_string(token,"p.p.name"))
 	&& cst_regex_match(threedigits,ffeature_string(token,"p.name"))
 	&& cst_regex_match(fourdigits,ffeature_string(token,"n.name"))))) ||
       (cst_regex_match(fourdigits,name) &&
        (!cst_regex_match(cst_rx_digits,ffeature_string(token,"n.name"))
 	&& cst_regex_match(threedigits,ffeature_string(token,"p.name"))
 	&& cst_regex_match(threedigits,ffeature_string(token,"p.p.name")))))
     {
 	/* part of a telephone number */
 	/* give the token a comma if it has no punctuation of its own */
 	if (cst_streq("",ffeature_string(token,"punc")))
 	    item_set_string(token,"punc",",");
 	r = add_break(en_exp_digits(name));
     }
     else if (cst_regex_match(numbertime,name))
     {
 	/* aaa is the text before the ':', bbb the text after it; */
 	/* a "00" after the colon is not spoken                   */
 	p=strchr(name,':');
 	aaa = cst_strdup(name);
 	aaa[cst_strlen(name)-cst_strlen(p)] = '\0';
 	bbb = cst_strdup(p+1);

 	r = en_exp_number(aaa);
 	if (!cst_streq("00",bbb))
 	    r = val_append(r,en_exp_id(bbb));
 	/* r = add_break(r); */

 	cst_free(aaa);
 	cst_free(bbb);
     }
     else if (cst_regex_match(numbertimexm,name))
     {
 	/* as above, but the separator may be ':' or '.', and whatever   */
 	/* follows the two characters after the separator (ccc) is spelt */
 	/* out as letters                                                */
 	p=strchr(name,':');
         if (!p) p=strchr(name,'.');
 	aaa = cst_strdup(name);
 	aaa[cst_strlen(name)-cst_strlen(p)] = '\0';
 	bbb = cst_strdup(p+1);
 	bbb[2] = '\0';
         ccc = cst_strdup(p+3);

 	r = en_exp_number(aaa);
 	if (!cst_streq("00",bbb))
 	    r = val_append(r,en_exp_id(bbb));
 	/* r = add_break(r); */

         r = val_append(r,en_exp_letters(ccc));

 	cst_free(aaa);
 	cst_free(bbb);
 	cst_free(ccc);
     }
     else if (cst_regex_match(digits2dash,name))
     {   /* 999-999-999 etc */
 	/* Split a copy of name at each '-'.  Parts are consed onto the */
 	/* front of ss, so ss holds them in reverse order (last first). */
 	bbb = cst_strdup(name);
 	for (ss=0,aaa=p=bbb; *p; p++)
 	{
 	    if (*p == '-')
 	    {
 		*p = '\0';
 		ss = cons_val(string_val(aaa),ss);
 		aaa = p+1;
 	    }
 	}
         ss = cons_val(string_val(aaa),ss);
         if ((val_length(ss) == 2) &&
             (atoi(val_string(val_car(val_cdr(ss)))) <
              atoi(val_string(val_car(ss)))))  /* its a number range */
         {
             /* Should get 22-23 November, or 1998-1999 right */
             r =
                 val_append(us_tokentowords_one(token,val_string(val_car(val_cdr(ss)))),
                   cons_val(string_val("to"),
                    us_tokentowords_one(token,val_string(val_car(ss)))));
         }
         else
         {  /* Its just a bunch of ids */
             /* ss is reversed, so prepending each expansion to r */
             /* restores the original order                       */
             r = 0;
             for (rr=ss; rr; rr=val_cdr(rr))
             {
                 r = val_append(
                      add_break(en_exp_digits(val_string(val_car(rr)))),r);

             }
         }
         delete_val(ss);
 	cst_free(bbb);
     }
     else if (cst_regex_match(cst_rx_digits,name))
     {   /* string of digits (use cart to disambiguate) */
 	if (cst_streq("nide",nsw))
 	    r = en_exp_id(name);
 	else {
 	    const cst_val *tv;
 	    const char *ts;
 	    char *rname;

 	    rname = cst_strdup(item_feat_string(token,"name"));
 	    if (cst_streq(name,rname))
 		tv = cart_interpret(token,&us_nums_cart);
 	    else
 	    {   /* in a recursive call */
 		/* the "name" feature is swapped to the sub-string while */
 		/* the cart runs, then restored from rname               */
 		item_set_string(token,"name",name);
 		tv = cart_interpret(token,&us_nums_cart);
 		item_set_string(token,"name",rname);
 	    }
 	    cst_free(rname);
 	    ts = val_string(tv);
 	    if (cst_streq(ts,"ordinal"))
 		r = en_exp_ordinal(name);
 	    else if (cst_streq(ts,"digits"))
 		r = en_exp_digits(name);
 	    else if (cst_streq(ts,"year"))
 		r = en_exp_id(name);
 	    else
 		r = en_exp_number(name);
 	}
     }
     else if (cst_regex_match(romannums,name))
     {   /* Roman numerals */
 	if (cst_streq("",ffeature_string(token,"p.punc")))
 	{   /* no preceding punc */
 	    char n[10];
 	    cst_sprintf(n,"%d",en_exp_roman(name));
 	    if (rex_like(token))
 		r = cons_val(string_val("the"),
 			     en_exp_ordinal(n));
 	    else if (section_like(token))
 		r = en_exp_number(n);
 	    else
 		r = en_exp_letters(name);
 	}
 	else
 	    r = en_exp_letters(name);
     }
     else if (cst_regex_match(drst,name))
     {   /* St Andrew's St, Dr King Dr */
 	/* "street"/"saint" here stand for the two possible readings: */
 	/* street or saint for S..., drive or doctor otherwise        */
 	const char *street;
 	const char *saint;
 	if ((name[0] == 's') || (name[0] == 'S'))
 	{
 	    street = "street";
 	    saint = "saint";
 	}
 	else
 	{
 	    street = "drive";
 	    saint = "doctor";
 	}
 	if ((item_next(token) == 0) ||
 	    strchr(item_feat_string(token,"punc"),','))
 	    r = cons_val(string_val(street),NULL);
 	else if (strchr(ffeature_string(token,"punc"),','))
 	    r = cons_val(string_val(saint),NULL);
 	else
 	{
 	    /* decide from the case of the first character of the */
 	    /* neighbouring token names                           */
 	    const char *pname = ffeature_string(token,"p.name");
 	    const char *nname = ffeature_string(token,"n.name");
 	    if ((pname[0] >= 'A') && (pname[0] <= 'Z') &&
 		(nname[0] >= 'a') && (nname[0] <= 'z'))
 		r = cons_val(string_val(street),NULL);
 	    else if ((pname[0] >= '0') && (pname[0] <= '9') &&
 		     (nname[0] >= 'a') && (nname[0] <= 'z'))
 		r = cons_val(string_val(street),NULL);
 	    else if ((pname[0] >= 'a') && (pname[0] <= 'z') &&
 		     (nname[0] >= 'A') && (nname[0] <= 'Z'))
 		r = cons_val(string_val(saint),NULL);
 	    else if (cst_streq(ffeature_string(token,"n.whitespace")," "))
 		r = cons_val(string_val(saint),NULL);
 	    else
 		r = cons_val(string_val(street),NULL);
 	}
 	/* a following "." belongs to the abbreviation, so remove it */
 	if (cst_streq(item_feat_string(token,"punc"),"."))
 	    item_set_string(token,"punc","");
     }
     else if (cst_streq(name,"Mr"))
     {
 	item_set_string(token,"punc","");
 	r = cons_val(string_val("mister"),NULL);
     }
     else if (cst_streq(name,"Mrs"))
     {
 	item_set_string(token,"punc","");
 	r = cons_val(string_val("missus"),NULL);
     }
     else if ((cst_streq(name,"read")) ||
              (cst_streq(name,"lead")))
     {   /* checking WSJ examples, this seems a quick and easy way to */
         /* get many of these correct */
         const char *pname = ffeature_string(token,"p.name");

         /* after the loop eedwords[i] is non-NULL only if pname was found */
         for (i=0; eedwords[i]; i++)
             if (cst_streq(pname,eedwords[i]))
                 break;

         if (eedwords[i])
         {  /* reed or leed */
             if (name[0] == 'r')
                 r = cons_val(string_val("reed"),NULL);
             else
                 r = cons_val(string_val("leed"),NULL);
         }
         else /* red or led */
         {
             if (name[0] == 'r')
                 r = cons_val(string_val("red"),NULL);
             else
                 r = cons_val(string_val("led"),NULL);
         }
     }
     else if (cst_streq(name,"am") || cst_streq(name,"AM"))
     {
         /* spelt out as letters when it is part of a larger token or */
         /* follows a time or a number; otherwise kept as a word      */
         if (!cst_streq(name,item_name(token)))
             r = en_exp_letters(name);
         else if (item_prev(token) &&
                  (cst_regex_match(numbertime,ffeature_string(token,"p.name")) ||
                   cst_regex_match(cst_rx_digits,ffeature_string(token,"p.name"))))
             r = en_exp_letters(name);
         else
             r = cons_val(string_val(name),NULL);
     }
     else if ((cst_strlen(name) == 1) &&
 	     (name[0] >= 'A') &&
 	     (name[0] <= 'Z') &&
 	     (cst_streq(" ",ffeature_string(token,"n.whitespace"))) &&
 	     (ffeature_string(token,"n.name")[0] >= 'A') &&
 	     (ffeature_string(token,"n.name")[0] <= 'Z'))
     {
 	/* single capital letter followed by a capitalized token: */
 	/* drop its punctuation and say the letter                */
 	item_set_string(token,"punc","");
 	aaa = cst_downcase(name);
 	if (cst_streq(aaa,"a"))
 	    r = cons_val(string_val("_a"),0);
 	else
 	    r = cons_val(string_val(aaa),0);
 	cst_free(aaa);
     }
     else if (cst_regex_match(cst_rx_double,name))
     {   /* real numbers */
 	r = en_exp_real(name);
     }
     else if (cst_regex_match(ordinal_number,name))
     {   /* explicit ordinals */
 	/* strip the final two characters before expanding */
 	aaa = cst_strdup(name);
 	aaa[cst_strlen(name)-2] = '\0';
 	r = en_exp_ordinal(aaa);
 	cst_free(aaa);
     }
     else if ((cst_regex_match(illion,name)) &&
 	     (cst_regex_match(usmoney,ffeature_string(token,"p.name"))))
     {
 	/* the previous token was a money amount, so "dollars" is */
 	/* said after this word rather than after the amount      */
 	r = cons_val(string_val(name),
 		     cons_val(string_val("dollars"),NULL));
     }
     else if (cst_regex_match(usmoney,name))
     {
 	/* US money */
 /*	printf("money, money, money %s\n", name); */
 	/* name[0] is skipped throughout; p is the decimal point, if any */
 	p = strchr(name,'.');

 	if (cst_regex_match(illion,ffeature_string(token,"n.name")))
 	{   /* carl sagan's billions and billions */
 	    r = en_exp_real(&name[1]);
 	}
 	else if (!p)
 	{
 	    /* whole amount only */
 	    aaa = cst_strdup(&name[1]);
 	    if (cst_streq("1",aaa))
 		r = cons_val(string_val("dollar"),NULL);
 	    else
 		r = cons_val(string_val("dollars"),NULL);
 	    r = val_append(us_tokentowords_one(token,aaa),r);
 	    cst_free(aaa);
 	}
 	else if ((cst_strlen(p) == 1) || (cst_strlen(p) > 3))
 	{   /* simply read as mumble point mumble */
 	    r = val_append(en_exp_real(&name[1]),
 			   cons_val(string_val("dollars"),NULL));
 	}
 	else
 	{
 	    /* aaa becomes the part before the point with commas removed */
 	    /* (its first character is still the one skipped above);     */
 	    /* p+1 is the part after the point                           */
 	    aaa = cst_strdup(name);
 	    aaa[cst_strlen(name)-cst_strlen(p)] = '\0';
 	    for (i=j=0; aaa[i] != '\0'; i++)
 	    {
 		if (aaa[i] != ',')
 		{
 		    aaa[j] = aaa[i];
 		    j++;
 		}
 	    }
 	    aaa[j] = '\0';
 	    if (cst_streq("00",p+1))
 		r = 0;
 	    else if (cst_streq("01",p+1))
 		r = val_append(en_exp_number(p+1),
 			       cons_val(string_val("cent"),NULL));
 	    else
 		r = val_append(en_exp_number(p+1),
 			       cons_val(string_val("cents"),NULL));

 	    if (cst_streq("1",aaa+1))
 		r = cons_val(string_val("dollar"),r);
 	    else
 		r = cons_val(string_val("dollars"),r);

 	    r = val_append(en_exp_number(aaa+1),r);
 	    cst_free(aaa);
 	}
     }
     else if (name[cst_strlen(name)-1] == '%')
     {
 	/* expand everything before the '%' and add "per cent" */
 	aaa = cst_strdup(name);
 	aaa[cst_strlen(aaa)-1] = '\0';
 	r = val_append(us_tokentowords_one(token,aaa),
 		       cons_val(string_val("per"),
 				cons_val(string_val("cent"),NULL)));
 	cst_free(aaa);

     }
     else if (cst_regex_match(numess,name))
     {   /* 60s and 7s and 9s */
 	aaa = cst_strdup(name);
 	aaa[cst_strlen(name)-1] = '\0';
 	r = val_append(us_tokentowords_one(token,aaa),
 		       cons_val(string_val("'s"),0));
 	cst_free(aaa);
     }
     else if (contains_unicode_single_quote(name))
     {
         /* A single quote is sometimes rendered as unicode "’" */
         /* so we map it back to an ascii single quote ' */
         aaa = map_unicode_single_quote(name);
         r = us_tokentowords_one(token, aaa);
         cst_free(aaa);
         return r;
     }
     else if ((p=(cst_strrchr(name,'\''))))
     {
 	/* p points at the last single quote; pc lists the suffixes */
 	/* that are kept as a separate word                         */
 	static const char * const pc[] = { "'s", "'ll", "'ve", "'d", NULL };

 	bbb = cst_downcase(p);
 	if (cst_member_string(bbb, pc))
 	{
 	    aaa = cst_strdup(name);
 	    aaa[cst_strlen(name)-cst_strlen(p)] = '\0';
 	    r = val_append(us_tokentowords_one(token,aaa),
 			   cons_val(string_val(bbb),0));
 	    cst_free(aaa);
 	}
 	else if (cst_streq(p,"'tve")) /* admittedly rare and weird */
 	{
 	    /* keep the "'t" with the stem, emit "'ve" separately */
 	    aaa = cst_strdup(name);
 	    aaa[cst_strlen(name)-cst_strlen(p)+2] = '\0';
 	    r = val_append(us_tokentowords_one(token,aaa),
 			   cons_val(string_val("'ve"),0));
 	    cst_free(aaa);
 	}
 	else
 	{
 	    /* any other quote: delete it and expand what remains */
 	    aaa = cst_strdup(name);
 	    strcpy(&aaa[cst_strlen(name)-cst_strlen(p)],p+1);
 	    r = us_tokentowords_one(token,aaa);
 	    cst_free(aaa);
 	}
 	cst_free(bbb);
     }
     else if ((cst_regex_match(digitsslashdigits,name)) &&
 	     (cst_streq(name,item_name(token))))
     {   /* might be fraction, or not */
 	/* aaa is the text before the '/', bbb the text after it */
 	p=strchr(name,'/');
 	aaa = cst_strdup(name);
 	aaa[cst_strlen(name)-cst_strlen(p)] = '\0';
 	bbb = cst_strdup(p+1);
 	if ((cst_streq("1",aaa)) && (cst_streq("2",bbb)))
 	    r = cons_val(string_val("a"),
 			 cons_val(string_val("half"),0));
 	else if (atoi(aaa) < (atoi(bbb)))
 	{
 	    r = val_append(en_exp_number(aaa),
 			   en_exp_ordinal(bbb));
 	    if (atoi(aaa) > 1)
 		r = val_append(r,cons_val(string_val("'s"),0));
 	}
 	else
 	    r = val_append(en_exp_number(aaa),
 			   cons_val(string_val("slash"),
 				    en_exp_number(bbb)));

 	if ((cst_regex_match(cst_rx_digits,ffeature_string(token,"p.name")))
 	    && (item_prev(token)))  /* don't mistake "0" as a number */
 	    r = cons_val(string_val("and"),r);
 	cst_free(aaa);
 	cst_free(bbb);
     }
     else if ((p=(strchr(name,'-'))))
     {   /* aaa-bbb */
 	/* split at the first '-' */
 	aaa = cst_strdup(name);
 	aaa[cst_strlen(name)-cst_strlen(p)] = '\0';
 	bbb = cst_strdup(p+1);
 	if (cst_regex_match(cst_rx_digits,aaa) &&
 	    cst_regex_match(cst_rx_digits,bbb))
 	{
 	    /* both halves are digits: say "aaa to bbb".  The "name"   */
 	    /* feature is set to each half while it is expanded and    */
 	    /* put back to the saved value (ccc) afterwards            */
             ccc = cst_strdup(name);
 	    item_set_string(token,"name",bbb);
 	    r = us_tokentowords_one(token,bbb);
 	    item_set_string(token,"name",aaa);
 	    r = val_append(us_tokentowords_one(token,aaa),
 			   cons_val(string_val("to"),r));
 	    item_set_string(token,"name",ccc);
             cst_free(ccc);
 	}
 	else
 	    r = val_append(us_tokentowords_one(token,aaa),
 			   us_tokentowords_one(token,bbb));
 	cst_free(aaa);
 	cst_free(bbb);
     }
     else if (cst_regex_match(wandm,name))
     {   /* weights and measures */
         /* j ends up as the index just after the last digit found */
         /* scanning backwards, i.e. where the unit text starts    */
         for (j=cst_strlen(name)-1; j > 0; j--)
             if (cst_strchr("0123456789",name[j]))
                 break;
         j += 1;
         /* i indexes the matching abbreviation, or the NULL terminator */
         for (i=0; wandm_abbrevs[i][0]; i++)
             if (cst_streq(name+j,wandm_abbrevs[i][0]))
                 break;
         aaa = cst_strdup(name);
         aaa[j] = '\0';
         /* remove any commas */
         for (k=0,l=0; aaa[l]; k++,l++)
         {
             if (aaa[l] == ',') l++;
             aaa[k] = aaa[l];
         }
         aaa[k] = '\0';
         if (!wandm_abbrevs[i][0]) /* didn't find an expansion */
 	    r = val_append(en_exp_number(aaa),
 			   us_tokentowords_one(token,name+j));
         else
             r = val_append(en_exp_number(aaa),
                            cons_val(string_val(wandm_abbrevs[i][1]),NULL));

         cst_free(aaa);
     }
     else if ((cst_strlen(name) > 1) && (!cst_regex_match(cst_rx_alpha,name)))
     {   /* its not just alphas */
 	/* cut after the first position text_splitable() accepts and */
 	/* expand the two pieces separately; the token is marked     */
 	/* nsw=nide before recursing                                 */
 	for (i=0; name[i] != '\0'; i++)
 	    if (text_splitable(name,i))
 		break;
 	aaa = cst_strdup(name);
 	aaa[i+1] = '\0';
 	bbb = cst_strdup(&name[i+1]);
 	item_set_string(token,"nsw","nide");
 	r = val_append(us_tokentowords_one(token,aaa),
 		       us_tokentowords_one(token,bbb));
 	cst_free(aaa);
 	cst_free(bbb);
     }
     else if ((s = state_name(name,token)))
     {
 	r = s;
     }
     else if ((cst_strlen(name) > 1) &&
 	     (cst_regex_match(cst_rx_alpha,name)) &&
              (!in_lex(lex,name,NULL,NULL)) &&  // AUP: Added 4th argument (voice feats) as NULL, needs to be revisited later.
 	     (!us_aswd(name)))
         /* Still not quite right, if there is a user_lex we need to check */
         /* it too -- but user_lex isn't user settable yet */
 	/* Need common exception list */
 	/* unpronounceable list of alphas */
 	r = en_exp_letters(name);

     /* buckets of other stuff missing */

     else  /* just a word */
     {
 	aaa = cst_downcase(name);
 	r = cons_val(string_val(aaa),0);
 	cst_free(aaa);
     }
     return r;
 }

 static int text_splitable(const char *s,int i)
 {
     /* should token be split after this */
     /* FALSE when s[i] and s[i+1] are both ascii letters or both     */
     /* digits, TRUE otherwise.  Note strchr() also finds the string  */
     /* terminator, so a NUL character counts as belonging to either  */
     /* class here.                                                   */
     static const char letters[] =
         "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
     static const char digits[] = "0123456789";
     const char here = s[i];
     const char after = s[i+1];

     if (strchr(letters,here) && strchr(letters,after))
         return FALSE;

     if (strchr(digits,here) && strchr(digits,after))
         return FALSE;

     return TRUE;
 }

 /* US state (and DC/PR) abbreviations.  Each row is                       */
 /*   { abbreviation, "ambiguous" or "", word1, word2-or-NULL, NULL }      */
 /* Rows marked "ambiguous" are only expanded by state_name() when the     */
 /* surrounding tokens look right; the others are always expanded.  The    */
 /* words from column 2 up to the first NULL are the spoken expansion, so  */
 /* they must be spelled as ordinary words.  The table ends with a row     */
 /* whose abbreviation is NULL.                                            */
 static const char * const states[99][5] =
 {
   { "AL", "ambiguous", "alabama" , NULL, NULL },
   { "Al", "ambiguous", "alabama" , NULL, NULL },
   { "Ala", "", "alabama" , NULL, NULL },
   { "AK", "", "alaska" , NULL, NULL },
   { "Ak", "", "alaska" , NULL, NULL },
   { "AZ", "", "arizona" , NULL, NULL },
   { "Az", "", "arizona" , NULL, NULL },
   { "CA", "", "california" , NULL, NULL },
   { "Ca", "", "california" , NULL, NULL },
   { "Cal", "ambiguous", "california" , NULL, NULL },
   { "Calif", "", "california" , NULL, NULL },
   { "CO", "ambiguous", "colorado" , NULL, NULL },
   { "Co", "ambiguous", "colorado" , NULL, NULL },
   { "Colo", "", "colorado" , NULL, NULL },
   { "DC", "", "d" , "c", NULL },
   { "DE", "", "delaware" , NULL, NULL },
   { "De", "ambiguous", "delaware" , NULL, NULL },
   { "Del", "ambiguous", "delaware" , NULL, NULL },
   { "FL", "", "florida" , NULL, NULL },
   { "Fl", "ambiguous", "florida" , NULL, NULL },
   { "Fla", "", "florida" , NULL, NULL },
   { "GA", "", "georgia" , NULL, NULL },
   { "Ga", "", "georgia" , NULL, NULL },
   { "HI", "", "hawaii" , NULL, NULL },
   { "Hi", "ambiguous", "hawaii" , NULL, NULL },
   { "IA", "", "iowa" , NULL, NULL },
   { "Ia", "ambiguous", "iowa" , NULL, NULL },
   { "Ind", "ambiguous", "indiana" , NULL, NULL },
   { "ID", "ambiguous", "idaho" , NULL, NULL },
   { "IL", "ambiguous", "illinois" , NULL, NULL },
   { "Il", "ambiguous", "illinois" , NULL, NULL },
   { "ILL", "ambiguous", "illinois" , NULL, NULL },
   { "KS", "", "kansas" , NULL, NULL },
   { "Ks", "", "kansas" , NULL, NULL },
   { "Kans", "", "kansas" , NULL, NULL },
   { "KY", "ambiguous", "kentucky" , NULL, NULL },
   { "Ky", "ambiguous", "kentucky" , NULL, NULL },
   { "LA", "ambiguous", "louisiana" , NULL, NULL },
   { "La", "ambiguous", "louisiana" , NULL, NULL },
   { "Lou", "ambiguous", "louisiana" , NULL, NULL },
   { "Lous", "ambiguous", "louisiana" , NULL, NULL },
   { "MA", "ambiguous", "massachusetts" , NULL, NULL },
   { "Mass", "ambiguous", "massachusetts" , NULL, NULL },
   { "Ma", "ambiguous", "massachusetts" , NULL, NULL },
   { "MD", "ambiguous", "maryland" , NULL, NULL },
   { "Md", "ambiguous", "maryland" , NULL, NULL },
   { "ME", "ambiguous", "maine" , NULL, NULL },
   { "Me", "ambiguous", "maine" , NULL, NULL },
   { "MI", "", "michigan" , NULL, NULL },
   { "Mi", "ambiguous", "michigan" , NULL, NULL },
   { "Mich", "ambiguous", "michigan" , NULL, NULL },
   { "MN", "ambiguous", "minnesota" , NULL, NULL },
   { "Minn", "ambiguous", "minnesota" , NULL, NULL },
   { "MS", "ambiguous", "mississippi" , NULL, NULL },
   { "Miss", "ambiguous", "mississippi" , NULL, NULL },
   { "MT", "ambiguous", "montana" , NULL, NULL },
   { "Mt", "ambiguous", "montana" , NULL, NULL },
   { "MO", "ambiguous", "missouri" , NULL, NULL },
   { "Mo", "ambiguous", "missouri" , NULL, NULL },
   { "NC", "ambiguous", "north" , "carolina", NULL },
   { "ND", "ambiguous", "north" , "dakota", NULL },
   { "NE", "ambiguous", "nebraska" , NULL, NULL },
   { "Ne", "ambiguous", "nebraska" , NULL, NULL },
   { "Neb", "ambiguous", "nebraska" , NULL, NULL },
   { "NH", "ambiguous", "new" , "hampshire", NULL },
   { "NV", "", "nevada" , NULL, NULL },
   { "Nev", "", "nevada" , NULL, NULL },
   { "NY", "", "new" , "york", NULL },
   { "OH", "ambiguous", "ohio" , NULL, NULL },
   { "OK", "ambiguous", "oklahoma" , NULL, NULL },
   { "Okla", "", "oklahoma" , NULL, NULL },
   { "OR", "ambiguous", "oregon" , NULL, NULL },
   { "Or", "ambiguous", "oregon" , NULL, NULL },
   { "Ore", "ambiguous", "oregon" , NULL, NULL },
   { "PA", "ambiguous", "pennsylvania" , NULL, NULL },
   { "Pa", "ambiguous", "pennsylvania" , NULL, NULL },
   { "Penn", "ambiguous", "pennsylvania" , NULL, NULL },
   { "RI", "ambiguous", "rhode" , "island", NULL },
   { "SC", "ambiguous", "south" , "carolina", NULL },
   { "SD", "ambiguous", "south" , "dakota", NULL },
   { "TN", "ambiguous", "tennessee" , NULL, NULL },
   { "Tn", "ambiguous", "tennessee" , NULL, NULL },
   { "Tenn", "ambiguous", "tennessee" , NULL, NULL },
   { "TX", "ambiguous", "texas" , NULL, NULL },
   { "Tx", "ambiguous", "texas" , NULL, NULL },
   { "Tex", "ambiguous", "texas" , NULL, NULL },
   { "UT", "ambiguous", "utah" , NULL, NULL },
   { "VA", "ambiguous", "virginia" , NULL, NULL },
   { "WA", "ambiguous", "washington" , NULL, NULL },
   { "Wa", "ambiguous", "washington" , NULL, NULL },
   { "Wash", "ambiguous", "washington" , NULL, NULL },
   { "WI", "ambiguous", "wisconsin" , NULL, NULL },
   { "Wi", "ambiguous", "wisconsin" , NULL, NULL },
   { "WV", "ambiguous", "west" , "virginia", NULL },
   { "WY", "ambiguous", "wyoming" , NULL, NULL },
   { "Wy", "ambiguous", "wyoming" , NULL, NULL },
   { "Wyo", "", "wyoming" , NULL, NULL },
   { "PR", "ambiguous", "puerto" , "rico", NULL },
   /* end of table marker: only the NULL abbreviation is significant */
   { NULL, NULL, NULL, NULL, NULL }
 };

 static cst_val *state_name(const char *name,cst_item *t)
 {
     /* If name is an abbreviation in the states table, return the list */
     /* of words it expands to, otherwise return 0.                     */
     /* Entries marked "ambiguous" are only expanded when               */
     /*   - the previous token name starts with a capital, is longer    */
     /*     than two characters and is alphabetic, and                  */
     /*   - the next token name starts in lower case, or there is no    */
     /*     next token, or this token's punc is ".", or the next token  */
     /*     name is a 5 or 10 character digit string (presumably a zip  */
     /*     code).                                                      */
     static const char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
     static const char lower[] = "abcdefghijklmnopqrstuvwxyz";
     cst_val *words = 0;
     int row, col;

     for (row=0; states[row][0]; row++)
     {
         if (!cst_streq(states[row][0],name))
             continue;

         if (cst_streq(states[row][1],"ambiguous"))
         {
             const char *prev = ffeature_string(t,"p.name");
             const char *next = ffeature_string(t,"n.name");
             int prev_fits;
             int next_fits;

             /* previous name is capitalized */
             prev_fits = (strchr(upper,prev[0]) != NULL) &&
                         (cst_strlen(prev) > 2) &&
                         (cst_regex_match(cst_rx_alpha,prev));
             if (!prev_fits)
                 continue;

             next_fits = (strchr(lower,next[0]) != NULL) ||
                         (item_next(t) == 0) ||
                         (cst_streq(".",item_feat_string(t,"punc"))) ||
                         (((cst_strlen(next) == 5) ||
                           (cst_strlen(next) == 10)) &&
                          cst_regex_match(cst_rx_digits,next));
             if (!next_fits)
                 continue;
         }

         /* consing builds the list back to front, so reverse it */
         for (col=2; states[row][col]; col++)
             words = cons_val(string_val(states[row][col]),words);
         return val_reverse(words);
     }

     return words;

 }