/* src/synth/cst_synth.c - third_party/flite - Git at Google */

 /*************************************************************************/
 /*                                                                       */
 /*                  Language Technologies Institute                      */
 /*                     Carnegie Mellon University                        */
 /*                         Copyright (c) 2000                            */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
 /*               Date:  September 2000                                   */
 /*************************************************************************/
 /*                                                                       */
 /*  General synthesis control                                            */
 /*                                                                       */
 /*************************************************************************/

 #include "cst_hrg.h"
 #include "cst_cart.h"
 #include "cst_tokenstream.h"
 #include "cst_utt_utils.h"
 #include "cst_lexicon.h"
 #include "cst_units.h"
 #include "cst_synth.h"
 #include "cst_phoneset.h"

 /* Register cst_breakfunc function pointers as a cst_val type named     */
 /* "breakfunc".  NOTE(review): presumably this generates accessors in   */
 /* the style of the val_uttfunc()/val_itemfunc() calls used below --    */
 /* confirm against the macro definition in the cst_val headers.         */
 CST_VAL_REGISTER_FUNCPTR(breakfunc,cst_breakfunc)

 /* Debug tracing level for the synthesis modules.  It defaults to 0,     */
 /* in which case every DPRINTF call compiles to an empty statement.      */
 #ifndef SYNTH_MODULES_DEBUG
 #define SYNTH_MODULES_DEBUG 0
 #endif

 /* DPRINTF(level, (fmt, args...)) -- the second argument is a complete,  */
 /* parenthesised cst_dbgmsg argument list.  Both variants expand to a    */
 /* single do/while statement so the macro can safely be used as the      */
 /* unbraced body of an if/else without capturing the following else.     */
 #if SYNTH_MODULES_DEBUG > 0
 #define DPRINTF(l,x) do { if (SYNTH_MODULES_DEBUG > (l)) cst_dbgmsg x; } while (0)
 #else
 #define DPRINTF(l,x) do { } while (0)
 #endif

 /* Forward declaration: tokentosegs is defined later in this file but is */
 /* referenced by the synth_method_phones table before that point.        */
 static cst_utterance *tokentosegs(cst_utterance *u);

 /* Stage table for full text synthesis (run by utt_synth).  Stages are   */
 /* applied in the order listed.  For each entry apply_synth_module()     */
 /* first looks in the utterance features for a value named after the     */
 /* hook and calls that; otherwise it calls the default function given    */
 /* here.  A NULL default means the stage does nothing unless such a      */
 /* feature is present.  The { NULL, NULL } entry terminates the table.   */
 static const cst_synth_module synth_method_text[] = {
     { "tokenizer_func", default_tokenization },
     { "textanalysis_func", default_textanalysis },
     { "pos_tagger_func", default_pos_tagger },
     { "phrasing_func", default_phrasing },
     { "lexical_insertion_func", default_lexical_insertion },
     { "pause_insertion_func", default_pause_insertion },
     { "intonation_func", cart_intonation },
     { "postlex_func", NULL },
     { "duration_model_func", cart_duration },
     { "f0_model_func", NULL },
     { "wave_synth_func", NULL },
     { "post_synth_hook_func", NULL },
     { NULL, NULL }
 };

 /* Stage table run by utt_synth_text2segs: the same leading stages as    */
 /* synth_method_text, stopping after pause insertion.  It has no         */
 /* intonation, duration, F0 or waveform stages.  Terminated by the       */
 /* { NULL, NULL } entry.                                                 */
 static const cst_synth_module synth_method_text2segs[] = {
     { "tokenizer_func", default_tokenization },
     { "textanalysis_func", default_textanalysis },
     { "pos_tagger_func", default_pos_tagger },
     { "phrasing_func", default_phrasing },
     { "lexical_insertion_func", default_lexical_insertion },
     { "pause_insertion_func", default_pause_insertion },
     { NULL, NULL }
 };

 /* Stage table run by utt_synth_tokens: identical to synth_method_text   */
 /* except that there is no tokenizer stage.                              */
 /* NOTE(review): presumably the caller has already built the "Token"     */
 /* relation that default_textanalysis reads -- confirm against callers.  */
 static const cst_synth_module synth_method_tokens[] = {
     { "textanalysis_func", default_textanalysis },
     { "pos_tagger_func", default_pos_tagger },
     { "phrasing_func", default_phrasing },
     { "lexical_insertion_func", default_lexical_insertion },
     { "pause_insertion_func", default_pause_insertion },
     { "intonation_func", cart_intonation },
     { "postlex_func", NULL },
     { "duration_model_func", cart_duration },
     { "f0_model_func", NULL },
     { "wave_synth_func", NULL },
     { "post_synth_hook_func", NULL },
     { NULL, NULL }
 };

 /* Stage table run by utt_synth_phones.  The input is tokenized as       */
 /* usual, but the default text-analysis stage is tokentosegs, which      */
 /* treats each token as a phone name.  There are no phrasing, lexical    */
 /* or pause insertion stages, intonation has no default, and the         */
 /* default F0 model is flat_prosody.  Terminated by { NULL, NULL }.      */
 static const cst_synth_module synth_method_phones[] = {
     { "tokenizer_func", default_tokenization },
     { "textanalysis_func", tokentosegs },
     { "pos_tagger_func", default_pos_tagger },
     { "intonation_func", NULL },
     { "duration_model_func", cart_duration },
     { "f0_model_func", flat_prosody },
     { "wave_synth_func", NULL },
     { "post_synth_hook_func", NULL },
     { NULL, NULL }
 };

 cst_utterance *utt_synth_wave(cst_wave *w,cst_voice *v)
 {
     /* Build a new utterance around an existing wave, as if voice v had  */
     /* just synthesized it.  If the utterance features carry a           */
     /* "streaming_info" value, the whole wave is passed once to its      */
     /* streaming callback.  Returns the new utterance.                   */
     cst_audio_streaming_info *stream;
     const cst_val *stream_val;
     cst_utterance *utt = new_utterance();

     utt_init(utt,v);
     utt_set_wave(utt,w);

     stream_val = get_param_val(utt->features,"streaming_info",NULL);
     if (stream_val == NULL)
         return utt;  /* no stream */

     stream = val_audio_streaming_info(stream_val);
     stream->utt = utt;

     /* Do streaming: one call covering samples 0..num_samples */
     (*stream->asc)(w,0,w->num_samples,1,stream);

     return utt;
 }

 cst_utterance *apply_synth_module(cst_utterance *u,
                                   const cst_synth_module *mod)
 {
     /* Run a single synthesis stage on u.  A function stored in the      */
     /* utterance features under mod->hookname takes precedence; failing  */
     /* that the stage's default hook is used; with neither, u is         */
     /* returned untouched.                                               */
     const cst_val *hook = feat_val(u->features, mod->hookname);

     if (hook != NULL)
         return (*val_uttfunc(hook))(u);

     return mod->defhook ? (*mod->defhook)(u) : u;
 }

 cst_utterance *apply_synth_method(cst_utterance *u,
                                   const cst_synth_module meth[])
 {
     /* Run every stage of a NULL-hookname-terminated stage table in      */
     /* order, threading the utterance through.  Stops early and returns  */
     /* NULL as soon as any stage returns NULL.                           */
     const cst_synth_module *stage;

     for (stage = meth; stage->hookname != NULL; stage++)
     {
         u = apply_synth_module(u, stage);
         if (u == NULL)
             return NULL;
     }

     return u;
 }

 cst_utterance *utt_init(cst_utterance *u, cst_voice *vox)
 {
     /* Attach voice vox to utterance u and return u.                     */

     /* Chain the voice's features and feature functions behind the       */
     /* utterance's own, so lookups fall through to the voice.            */
     feat_link_into(vox->features,u->features);
     feat_link_into(vox->ffunctions,u->ffunctions);

     /* Voices may supply their own per-utterance initializer */
     if (vox->utt_init != NULL)
     {
         (*vox->utt_init)(u, vox);
     }

     return u;
 }

 /* Run the full text synthesis stage table (synth_method_text) on u.     */
 /* Returns the resulting utterance, or NULL if a stage returned NULL.    */
 cst_utterance *utt_synth(cst_utterance *u)
 {
     const cst_synth_module *stages = synth_method_text;

     return apply_synth_method(u, stages);
 }

 /* Run the synth_method_tokens stage table (no tokenizer stage) on u.    */
 /* Returns the resulting utterance, or NULL if a stage returned NULL.    */
 cst_utterance *utt_synth_tokens(cst_utterance *u)
 {
     const cst_synth_module *stages = synth_method_tokens;

     return apply_synth_method(u, stages);
 }

 /* Run the synth_method_text2segs stage table on u, i.e. the stages up   */
 /* to and including pause insertion.  Returns the resulting utterance,   */
 /* or NULL if a stage returned NULL.                                     */
 cst_utterance *utt_synth_text2segs(cst_utterance *u)
 {
     const cst_synth_module *stages = synth_method_text2segs;

     return apply_synth_method(u, stages);
 }

 /* Run the synth_method_phones stage table on u, in which tokens are     */
 /* taken to be phone names.  Returns the resulting utterance, or NULL    */
 /* if a stage returned NULL.                                             */
 cst_utterance *utt_synth_phones(cst_utterance *u)
 {
     const cst_synth_module *stages = synth_method_phones;

     return apply_synth_method(u, stages);
 }

 cst_utterance *default_tokenization(cst_utterance *u)
 {
     /* Split the utterance's input text into tokens and record each      */
     /* non-empty one as an item of a new "Token" relation, together      */
     /* with the whitespace and punctuation the token stream reports for  */
     /* it and its file position / line number.  The character classes    */
     /* come from the "text_*" features.  Returns u.                      */
     const char *input, *tok;
     cst_relation *token_rel;
     cst_tokenstream *ts;
     cst_item *item;

     input = utt_input_text(u);
     token_rel = utt_relation_create(u,"Token");
     ts = ts_open_string(input,
                         get_param_string(u->features,"text_whitespace",NULL),
                         get_param_string(u->features,"text_singlecharsymbols",NULL),
                         get_param_string(u->features,"text_prepunctuation",NULL),
                         get_param_string(u->features,"text_postpunctuation",NULL));

     while (!ts_eof(ts))
     {
         tok = ts_get(ts);
         if (cst_strlen(tok) == 0)
             continue;  /* empty tokens are not recorded */

         item = relation_append(token_rel,NULL);
         item_set_string(item,"name",tok);
         item_set_string(item,"whitespace",ts->whitespace);
         item_set_string(item,"prepunctuation",ts->prepunctuation);
         item_set_string(item,"punc",ts->postpunctuation);
         item_set_int(item,"file_pos",ts->file_pos);
         item_set_int(item,"line_number",ts->line_number);
     }

     ts_close(ts);

     return u;
 }

 /* Default token-to-words mapping: a newly built one-element list        */
 /* holding the token's "name" as its only word.  default_textanalysis    */
 /* deletes the list it gets back from this function.                     */
 cst_val *default_tokentowords(cst_item *i)
 {
     const char *token_name = item_feat_string(i,"name");
     cst_val *word = string_val(token_name);

     return cons_val(word, NULL);
 }

 cst_utterance *default_textanalysis(cst_utterance *u)
 {
     /* Expand each token into words.  The word list for a token comes    */
     /* from the "tokentowords_func" feature when present, otherwise      */
     /* from default_tokentowords.  Every word becomes a daughter of its  */
     /* token and is appended to a new "Word" relation.  A list entry     */
     /* may be a plain string, or a cons whose car is the name and whose  */
     /* cdr carries extra features to copy onto the word.  Returns u.     */
     cst_relation *word_rel = utt_relation_create(u,"Word");
     const cst_val *ttw_func = feat_val(u->features, "tokentowords_func");
     const cst_val *cell, *entry;
     cst_val *word_list;
     cst_item *token, *word;

     token = relation_head(utt_relation(u,"Token"));
     while (token != NULL)
     {
         word_list = ttw_func
             ? (cst_val *)(*val_itemfunc(ttw_func))(token)
             : default_tokentowords(token);

         for (cell = word_list; cell; cell = val_cdr(cell))
         {
             word = item_add_daughter(token,NULL);
             entry = val_car(cell);
             if (!cst_val_consp(entry))
             {
                 item_set_string(word,"name",val_string(entry));
             }
             else
             {   /* (name . features) */
                 item_set_string(word,"name",val_string(val_car(entry)));
                 feat_copy_into(val_features(val_cdr(entry)),
                                item_feats(word));
             }
             relation_append(word_rel,word);
         }
         delete_val(word_list);

         token = item_next(token);
     }

     return u;
 }

 cst_utterance *default_phrasing(cst_utterance *u)
 {
     /* Group the words into phrases in a new "Phrase" relation.  Each    */
     /* phrase item is named "B" and has its words as daughters.  When a  */
     /* "phrasing_cart" feature exists, a word for which that tree        */
     /* predicts "BB" ends the current phrase; without it all words fall  */
     /* into one phrase.  Returns u.                                      */
     cst_relation *phrase_rel;
     cst_cart *tree = NULL;
     cst_item *word;
     cst_item *current = NULL;   /* phrase being filled, NULL = start new */
     cst_item *newest = NULL;    /* most recently created phrase          */
     const cst_val *prediction;

     phrase_rel = utt_relation_create(u,"Phrase");
     if (feat_present(u->features,"phrasing_cart"))
         tree = val_cart(feat_val(u->features,"phrasing_cart"));

     word = relation_head(utt_relation(u,"Word"));
     for ( ; word; word = item_next(word))
     {
         if (current == NULL)
         {
             current = relation_append(phrase_rel,NULL);
             newest = current;
             item_set_string(current,"name","B");
         }
         item_add_daughter(current,word);

         if (tree == NULL)
             continue;

         prediction = cart_interpret(word,tree);
         if (cst_streq(val_string(prediction),"BB"))
             current = NULL;
     }

     /* follow festival: the final phrase is renamed "BB", but only when  */
     /* it is not the sole phrase                                         */
     if (newest != NULL && item_prev(newest) != NULL)
         item_set_string(newest,"name","BB");

     return u;
 }

 cst_utterance *default_pause_insertion(cst_utterance *u)
 {
     /* Insert a segment named by the "silence" feature at the very       */
     /* start of the Segment relation and after the last segment of each  */
     /* phrase.  Returns u.                                               */
     const char *sil_name;
     const cst_item *word;
     cst_item *phrase, *seg, *pause;

     sil_name = val_string(feat_val(u->features,"silence"));

     /* Initial silence: prepend to the first segment, or start the       */
     /* relation when it is empty                                         */
     seg = relation_head(utt_relation(u,"Segment"));
     if (seg != NULL)
         pause = item_prepend(seg,NULL);
     else
         pause = relation_append(utt_relation(u,"Segment"),NULL);
     item_set_string(pause,"name",sil_name);

     for (phrase = relation_head(utt_relation(u,"Phrase"));
          phrase;
          phrase = item_next(phrase))
     {
         /* Walk back from the phrase's last word to the first word that  */
         /* actually has a final segment and put the pause after it.      */
         for (word = item_last_daughter(phrase); word; word = item_prev(word))
         {
             seg = path_to_item(word,"R:SylStructure.daughtern.daughtern.R:Segment");
             if (seg == NULL)
                 continue;

             pause = item_append(seg,NULL);
             item_set_string(pause,"name",sil_name);
             break;
         }
     }

     return u;
 }

 cst_utterance *cart_intonation(cst_utterance *u)
 {
     /* Predict an accent and an end tone for every syllable using the    */
     /* "int_cart_accents" and "int_cart_tones" trees.  A prediction of   */
     /* "NONE" leaves the corresponding feature unset.  Does nothing if   */
     /* the "no_intonation_accent_model" feature is present.  Returns u.  */
     cst_cart *accent_tree, *tone_tree;
     cst_item *syl;
     const cst_val *prediction;

     if (feat_present(u->features,"no_intonation_accent_model"))
         return u;  /* not all languages have intonation models */

     accent_tree = val_cart(feat_val(u->features,"int_cart_accents"));
     tone_tree = val_cart(feat_val(u->features,"int_cart_tones"));

     syl = relation_head(utt_relation(u,"Syllable"));
     while (syl != NULL)
     {
         prediction = cart_interpret(syl,accent_tree);
         if (!cst_streq("NONE",val_string(prediction)))
             item_set_string(syl,"accent",val_string(prediction));

         prediction = cart_interpret(syl,tone_tree);
         if (!cst_streq("NONE",val_string(prediction)))
             item_set_string(syl,"endtone",val_string(prediction));

         DPRINTF(0,("word %s gpos %s stress %s ssyl_in %s ssyl_out %s accent %s endtone %s\n",
                    ffeature_string(syl,"R:SylStructure.parent.name"),
                    ffeature_string(syl,"R:SylStructure.parent.gpos"),
                    ffeature_string(syl,"stress"),
                    ffeature_string(syl,"ssyl_in"),
                    ffeature_string(syl,"ssyl_out"),
                    ffeature_string(syl,"accent"),
                    ffeature_string(syl,"endtone")));

         syl = item_next(syl);
     }

     return u;
 }

 /* Register dur_stats as a cst_val type; cart_duration reads it back     */
 /* with val_dur_stats().  NOTE(review): the _NODEL suffix presumably     */
 /* means the val does not free the wrapped table -- confirm against      */
 /* the macro definition in the cst_val headers.                          */
 CST_VAL_REGISTER_TYPE_NODEL(dur_stats,dur_stats)

 /* Look up the duration statistics for phone ph in ds, a table whose     */
 /* end is marked by a null entry.  When no entry has that phone name     */
 /* the first entry of the table is returned as a fallback.               */
 const dur_stat *phone_dur_stat(const dur_stats *ds,const char *ph)
 {
     const dur_stats *entry;

     for (entry = ds; *entry; entry++)
     {
         if (cst_streq(ph,(*entry)->phone))
             return *entry;
     }

     return ds[0];
 }

 cst_utterance *cart_duration(cst_utterance *u)
 {
     /* Give every segment an "end" time.  The "dur_cart" tree predicts   */
     /* a z-score per segment, which is scaled by the phone's stddev and  */
     /* mean from "dur_stats" and then multiplied by the stretch factor.  */
     /* End times accumulate from 0 along the Segment relation.  Does     */
     /* nothing if "no_segment_duration_model" is present.  Returns u.    */
     cst_cart *tree;
     dur_stats *stats;
     const dur_stat *phone_stat;
     cst_item *seg;
     float zscore, global_stretch, stretch, seg_dur;
     float end_time = 0;

     if (feat_present(u->features,"no_segment_duration_model"))
         return u;  /* not all methods need segment durations */

     tree = val_cart(feat_val(u->features,"dur_cart"));
     global_stretch = get_param_float(u->features,"duration_stretch", 1.0);
     stats = val_dur_stats(feat_val(u->features,"dur_stats"));

     seg = relation_head(utt_relation(u,"Segment"));
     while (seg != NULL)
     {
         zscore = val_float(cart_interpret(seg,tree));
         phone_stat = phone_dur_stat(stats,item_name(seg));

         /* A non-zero per-token stretch multiplies the global one;       */
         /* zero means only the global stretch applies.                   */
         stretch = ffeature_float(seg, "R:SylStructure.parent.parent."
                                  "R:Token.parent.local_duration_stretch");
         if (!stretch)
             stretch = global_stretch;
         else
             stretch *= global_stretch;

         seg_dur = stretch * ((zscore*phone_stat->stddev)+phone_stat->mean);
         DPRINTF(0,("phone %s accent %s stress %s pdur %f stretch %f mean %f std %f dur %f\n",
                    item_name(seg),
                    ffeature_string(seg,"R:SylStructure.parent.accented"),
                    ffeature_string(seg,"R:SylStructure.parent.stress"),
                    zscore, stretch, phone_stat->mean,
                    phone_stat->stddev, seg_dur));
         end_time += seg_dur;
         item_set_float(seg,"end",end_time);

         seg = item_next(seg);
     }
     return u;
 }

 cst_utterance *default_pos_tagger(cst_utterance *u)
 {
     /* Set a "pos" feature on every word from the tree stored in the     */
     /* "pos_tagger_cart" feature.  Without that feature the utterance    */
     /* is returned unchanged.                                            */
     const cst_val *cart_val, *tag;
     const cst_cart *tree;
     cst_item *w;

     cart_val = get_param_val(u->features,"pos_tagger_cart",NULL);
     if (!cart_val)
         return u;
     tree = val_cart(cart_val);

     w = relation_head(utt_relation(u,"Word"));
     while (w != NULL)
     {
         tag = cart_interpret(w,tree);
         item_set_string(w,"pos",val_string(tag));
         w = item_next(w);
     }

     return u;
 }

 cst_utterance *default_lexical_insertion(cst_utterance *u)
 {
     /* Build the Syllable, SylStructure and Segment relations from the   */
     /* Word relation.  The phones for each word come from, in order of   */
     /* preference:                                                       */
     /*   1. a "phones" feature on the word's parent token,               */
     /*   2. the lexicon's addenda,                                       */
     /*   3. lex_lookup() on the word name and its "pos" feature.         */
     /* A trailing '1' or '0' on a phone name is a stress mark: it is     */
     /* removed from the segment name and recorded as the "stress"        */
     /* feature of the syllable once lex->syl_boundary() reports the      */
     /* syllable's end.  Returns u.                                       */
     cst_item *word;
     cst_relation *sylstructure,*seg,*syl;
     cst_lexicon *lex;
     const cst_val *lex_addenda;
     const cst_val *p, *wp;
     char *phone_name;
     char *last;                 /* last character of phone_name, if any */
     const char *stress = "0";
     const char *pos;
     cst_val *phones;
     cst_item *ssword, *sssyl, *segitem, *sylitem, *seg_in_syl;
     const cst_val *vpn;
     int dp;                     /* should the phones get deleted or not */

     lex = val_lexicon(feat_val(u->features,"lexicon"));
     lex_addenda = lex->lex_addenda;  /* may be NULL */

     syl = utt_relation_create(u,"Syllable");
     sylstructure = utt_relation_create(u,"SylStructure");
     seg = utt_relation_create(u,"Segment");

     for (word=relation_head(utt_relation(u,"Word"));
          word; word=item_next(word))
     {
         ssword = relation_append(sylstructure,word);
         pos = ffeature_string(word,"pos");
         phones = NULL;
         wp = NULL;
         dp = 0;

         /* FIXME: need to make sure that textanalysis won't split
            tokens with explicit pronunciation (or that it will
            propagate such to words, then we can remove the path here) */
         if (item_feat_present(item_parent(item_as(word, "Token")), "phones"))
         {
             vpn = item_feat(item_parent(item_as(word, "Token")), "phones");
             if (cst_val_consp(vpn))
             {   /* for SAPI ?? */
                 /* awb oct11: this seems wrong -- */
                 /* not sure SAPI still (ever) works Oct11 */
                 phones = (cst_val *) vpn;   /* borrowed, not deleted */
             }
             else
             {
                 dp = 1;
                 if (cst_streq(val_string(vpn),
                               ffeature_string(word,"p.R:Token.parent.phones")))
                     phones = NULL; /* Already given these phones */
                 else
                     phones = val_readlist_string(val_string(vpn));
             }
         }
         else
         {
             wp = val_assoc_string(item_feat_string(word, "name"),lex_addenda);
             if (wp)
                 phones = (cst_val *)val_cdr(val_cdr(wp)); /* borrowed */
             else
             {
                 dp = 1;
                 phones = lex_lookup(lex,item_feat_string(word,"name"),pos,
                                     u->features);
             }
         }

         for (sssyl=NULL,sylitem=NULL,p=phones; p; p=val_cdr(p))
         {
             if (sylitem == NULL)
             {
                 sylitem = relation_append(syl,NULL);
                 sssyl = item_add_daughter(ssword,sylitem);
                 stress = "0";
             }
             segitem = relation_append(seg,NULL);
             phone_name = cst_strdup(val_string(val_car(p)));

             /* Strip a trailing stress digit.  An empty phone string has */
             /* no last character, so it must not be indexed at all.      */
             last = NULL;
             if (phone_name[0] != '\0')
                 last = phone_name + cst_strlen(phone_name) - 1;
             if (last && *last == '1')
             {
                 stress = "1";
                 *last = '\0';
             }
             else if (last && *last == '0')
             {
                 stress = "0";
                 *last = '\0';
             }

             item_set_string(segitem,"name",phone_name);
             seg_in_syl = item_add_daughter(sssyl,segitem);
             if ((lex->syl_boundary)(seg_in_syl,val_cdr(p)))
             {
                 /* Syllable complete: the next phone starts a new one */
                 sylitem = NULL;
                 if (sssyl)
                     item_set_string(sssyl,"stress",stress);
             }
             cst_free(phone_name);
         }
         if (dp)
         {
             delete_val(phones);
             phones = NULL;
         }
     }

     return u;
 }

 /* Dummy F0 modelling for phones, copied directly from us_f0_model.c     */
 /*                                                                       */
 /* Creates a "Target" relation with exactly two targets: one at          */
 /* position 0 with f0 = mean+stddev and one at the "end" time of the     */
 /* last segment with f0 = mean-stddev.  mean is "target_f0_mean"         */
 /* (default 100) multiplied by "f0_shift" (default 1); stddev is         */
 /* "target_f0_stddev" (default 12).  Returns u.                          */
 cst_utterance *flat_prosody(cst_utterance *u)
 {
     /* F0 target model */
     cst_item *last_seg,*t;
     cst_relation *targ_rel;
     float mean, stddev;

     targ_rel = utt_relation_create(u,"Target");
     mean = get_param_float(u->features,"target_f0_mean", 100.0);
     mean *= get_param_float(u->features,"f0_shift", 1.0);
     stddev = get_param_float(u->features,"target_f0_stddev", 12.0);

     /* Start target.  Nothing from the first segment is needed for it,   */
     /* so the Segment relation's head is not looked up.                  */
     t = relation_append(targ_rel,NULL);
     item_set_float(t,"pos",0.0);
     item_set_float(t,"f0",mean+stddev);

     /* End target, placed at the end time of the final segment.          */
     /* NOTE(review): assumes the Segment relation is non-empty and that  */
     /* "end" has already been set by the duration stage -- confirm.      */
     last_seg = relation_tail(utt_relation(u,"Segment"));
     t = relation_append(targ_rel,NULL);
     item_set_float(t,"pos",item_feat_float(last_seg,"end"));
     item_set_float(t,"f0",mean-stddev);

     return u;
 }

 static cst_utterance *tokentosegs(cst_utterance *u)
 {
     /* Text-analysis stage for phone-string input: every token is taken  */
     /* to be a phone name from the utterance's "phoneset".  All phones   */
     /* hang under a single word named "phonestring".  A "-" token is a   */
     /* syllable break, not a phone.  A trailing '1' or '0' on a token    */
     /* is a stress mark recorded on the current syllable and removed     */
     /* from the segment name.  A token that is not in the phoneset is    */
     /* reported through cst_errmsg()/cst_error().  Returns u.            */
     cst_item *t;
     cst_relation *seg, *syl, *sylstructure, *word;
     cst_item *sylitem, *sylstructureitem, *worditem, *sssyl;
     cst_phoneset *ps;

     ps = val_phoneset(utt_feat_val(u, "phoneset"));
     /* Just copy tokens into the Segment relation */
     seg = utt_relation_create(u, "Segment");
     syl = utt_relation_create(u, "Syllable");
     word = utt_relation_create(u, "Word");
     sylstructure = utt_relation_create(u, "SylStructure");
     sssyl = sylitem = worditem = sylstructureitem = NULL;
     for (t = relation_head(utt_relation(u, "Token")); t; t = item_next(t))
     {
         cst_item *segitem;
         char const *pname = item_feat_string(t, "name");
         char *name = cst_strdup(pname);
         char *last = NULL;      /* last character of name, if any */

         if (worditem == NULL)
         {
             worditem = relation_append(word,NULL);
             item_set_string(worditem, "name", "phonestring");
             sylstructureitem = relation_append(sylstructure,worditem);
         }
         if (sylitem == NULL)
         {
             sylitem = relation_append(syl,NULL);
             sssyl = item_add_daughter(sylstructureitem,sylitem);
         }

         /* Strip a trailing stress digit; never index an empty name */
         if (name[0] != '\0')
             last = name + cst_strlen(name) - 1;
         if (last && *last == '1')
         {
             item_set_string(sssyl,"stress","1");
             *last = '\0';
         }
         else if (last && *last == '0')
         {
             item_set_string(sssyl,"stress","0");
             *last = '\0';
         }

         if (cst_streq(name,"-"))
         {
             sylitem = NULL;  /* syllable break */
         }
         else if (phone_id(ps, name) == -1)
         {
             cst_errmsg("Phone `%s' not in phoneset\n", pname);
             /* Release the copy first.  NOTE(review): cst_error() is     */
             /* presumably a non-local exit, in which case the free at    */
             /* the bottom of the loop would never run -- confirm.        */
             cst_free(name);
             name = NULL;
             cst_error();
         }
         else
         {
             /* Only a real phone gets a Segment item, so syllable breaks */
             /* and rejected tokens leave no nameless segment behind.     */
             segitem = relation_append(seg, NULL);
             item_add_daughter(sssyl,segitem);
             item_set_string(segitem, "name", name);
         }

         if (name != NULL)
             cst_free(name);
     }

     return u;
 }

 /* Non-zero when c is one of the characters of set.  Unlike a bare       */
 /* strchr() call, the terminating NUL of set never matches, so the       */
 /* first character of an empty string is not mistaken for a letter.      */
 static int utt_break_char_in(const char *set, char c)
 {
     return (c != '\0') && (strchr(set,c) != NULL);
 }

 /* Decide whether an utterance break falls between the last token in     */
 /* tokens and the upcoming token.                                        */
 /*   ts     - token stream; its whitespace field is the whitespace       */
 /*            examined for blank lines and for length                    */
 /*   token  - the upcoming token's text                                  */
 /*   tokens - relation whose tail is the previous token                  */
 /* Returns TRUE for a break, FALSE otherwise.                            */
 int default_utt_break(cst_tokenstream *ts,
                       const char *token,
                       cst_relation *tokens)
 {
     /* This is the default utt break functions, languages may override this */
     /* This will be ok for some latin based languages */
     const char *upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
     const char *lower = "abcdefghijklmnopqrstuvwxyz";
     const char *postpunct = item_feat_string(relation_tail(tokens), "punc");
     const char *ltoken = item_name(relation_tail(tokens));

     if (cst_strchr(ts->whitespace,'\n') != cst_strrchr(ts->whitespace,'\n'))
         /* contains two new lines */
         return TRUE;

     /* Well, this is a little specific isn't it. */
     if ((cst_streq(ltoken,"Yahoo") ||
          cst_streq(ltoken,"YAHOO") ||
          cst_streq(ltoken,"yahoo")) &&
         strchr(postpunct,'!') &&
         utt_break_char_in(lower,token[0]))
         return FALSE;

     if (strchr(postpunct,':') ||
         strchr(postpunct,'?') ||
         strchr(postpunct,'!'))
         return TRUE;

     /* Everything below needs a full stop followed by a capital */
     if (!strchr(postpunct,'.') || !utt_break_char_in(upper,token[0]))
         return FALSE;

     if (cst_strlen(ts->whitespace) > 1)
         return TRUE;

     /* An empty previous token has no last character to inspect and      */
     /* cannot be an abbreviation.                                        */
     if (ltoken[0] == '\0')
         return TRUE;

     /* last word isn't an abbreviation: it neither ends in a capital     */
     /* nor is a short (< 4 chars) capitalised word                       */
     if (utt_break_char_in(upper,ltoken[cst_strlen(ltoken)-1]))
         return FALSE;
     if ((cst_strlen(ltoken) < 4) && utt_break_char_in(upper,ltoken[0]))
         return FALSE;

     return TRUE;
 }
	/*************************************************************************/
	/* */
	/* Language Technologies Institute */
	/* Carnegie Mellon University */
	/* Copyright (c) 2000 */
	/* All Rights Reserved. */
	/* */
	/* Permission is hereby granted, free of charge, to use and distribute */
	/* this software and its documentation without restriction, including */
	/* without limitation the rights to use, copy, modify, merge, publish, */
	/* distribute, sublicense, and/or sell copies of this work, and to */
	/* permit persons to whom this work is furnished to do so, subject to */
	/* the following conditions: */
	/* 1. The code must retain the above copyright notice, this list of */
	/* conditions and the following disclaimer. */
	/* 2. Any modifications must be clearly marked as such. */
	/* 3. Original authors' names are not deleted. */
	/* 4. The authors' names are not used to endorse or promote products */
	/* derived from this software without specific prior written */
	/* permission. */
	/* */
	/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
	/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
	/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
	/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
	/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
	/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
	/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
	/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
	/* THIS SOFTWARE. */
	/* */
	/*************************************************************************/
	/* Author: Alan W Black (awb@cs.cmu.edu) */
	/* Date: September 2000 */
	/*************************************************************************/
	/* */
	/* General synthesis control */
	/* */
	/*************************************************************************/

	#include "cst_hrg.h"
	#include "cst_cart.h"
	#include "cst_tokenstream.h"
	#include "cst_utt_utils.h"
	#include "cst_lexicon.h"
	#include "cst_units.h"
	#include "cst_synth.h"
	#include "cst_phoneset.h"

	/* Register cst_breakfunc function pointers as a cst_val type named */
	/* "breakfunc".  NOTE(review): presumably this generates accessors in */
	/* the style of the val_uttfunc()/val_itemfunc() calls used below -- */
	/* confirm against the macro definition in the cst_val headers. */
	CST_VAL_REGISTER_FUNCPTR(breakfunc,cst_breakfunc)

	/* Debug tracing level for the synthesis modules.  It defaults to 0, */
	/* in which case every DPRINTF call compiles to an empty statement. */
	#ifndef SYNTH_MODULES_DEBUG
	#define SYNTH_MODULES_DEBUG 0
	#endif

	/* DPRINTF(level, (fmt, args...)) -- the second argument is a complete, */
	/* parenthesised cst_dbgmsg argument list.  Both variants expand to a */
	/* single do/while statement so the macro can safely be used as the */
	/* unbraced body of an if/else without capturing the following else. */
	#if SYNTH_MODULES_DEBUG > 0
	#define DPRINTF(l,x) do { if (SYNTH_MODULES_DEBUG > (l)) cst_dbgmsg x; } while (0)
	#else
	#define DPRINTF(l,x) do { } while (0)
	#endif

	/* Forward declaration of the phone-string text-analysis stage that is */
	/* referenced by the synth_method_phones table.  It takes and returns */
	/* a pointer to the utterance, like every other stage function. */
	static cst_utterance *tokentosegs(cst_utterance *u);

	/* Stage table for full text synthesis (run by utt_synth).  Stages are */
	/* applied in the order listed.  For each entry apply_synth_module() */
	/* first looks in the utterance features for a value named after the */
	/* hook and calls that; otherwise it calls the default function given */
	/* here.  A NULL default means the stage does nothing unless such a */
	/* feature is present.  The { NULL, NULL } entry terminates the table. */
	static const cst_synth_module synth_method_text[] = {
	{ "tokenizer_func", default_tokenization },
	{ "textanalysis_func", default_textanalysis },
	{ "pos_tagger_func", default_pos_tagger },
	{ "phrasing_func", default_phrasing },
	{ "lexical_insertion_func", default_lexical_insertion },
	{ "pause_insertion_func", default_pause_insertion },
	{ "intonation_func", cart_intonation },
	{ "postlex_func", NULL },
	{ "duration_model_func", cart_duration },
	{ "f0_model_func", NULL },
	{ "wave_synth_func", NULL },
	{ "post_synth_hook_func", NULL },
	{ NULL, NULL }
	};

	/* Stage table run by utt_synth_text2segs: the same leading stages as */
	/* synth_method_text, stopping after pause insertion.  It has no */
	/* intonation, duration, F0 or waveform stages.  Terminated by the */
	/* { NULL, NULL } entry. */
	static const cst_synth_module synth_method_text2segs[] = {
	{ "tokenizer_func", default_tokenization },
	{ "textanalysis_func", default_textanalysis },
	{ "pos_tagger_func", default_pos_tagger },
	{ "phrasing_func", default_phrasing },
	{ "lexical_insertion_func", default_lexical_insertion },
	{ "pause_insertion_func", default_pause_insertion },
	{ NULL, NULL }
	};

	/* Stage table run by utt_synth_tokens: identical to synth_method_text */
	/* except that there is no tokenizer stage. */
	/* NOTE(review): presumably the caller has already built the "Token" */
	/* relation that default_textanalysis reads -- confirm against callers. */
	static const cst_synth_module synth_method_tokens[] = {
	{ "textanalysis_func", default_textanalysis },
	{ "pos_tagger_func", default_pos_tagger },
	{ "phrasing_func", default_phrasing },
	{ "lexical_insertion_func", default_lexical_insertion },
	{ "pause_insertion_func", default_pause_insertion },
	{ "intonation_func", cart_intonation },
	{ "postlex_func", NULL },
	{ "duration_model_func", cart_duration },
	{ "f0_model_func", NULL },
	{ "wave_synth_func", NULL },
	{ "post_synth_hook_func", NULL },
	{ NULL, NULL }
	};

	/* Stage table run by utt_synth_phones.  The input is tokenized as */
	/* usual, but the default text-analysis stage is tokentosegs.  There */
	/* are no phrasing, lexical or pause insertion stages, intonation has */
	/* no default, and the default F0 model is flat_prosody.  Terminated */
	/* by the { NULL, NULL } entry. */
	static const cst_synth_module synth_method_phones[] = {
	{ "tokenizer_func", default_tokenization },
	{ "textanalysis_func", tokentosegs },
	{ "pos_tagger_func", default_pos_tagger },
	{ "intonation_func", NULL },
	{ "duration_model_func", cart_duration },
	{ "f0_model_func", flat_prosody },
	{ "wave_synth_func", NULL },
	{ "post_synth_hook_func", NULL },
	{ NULL, NULL }
	};

	cst_utterance *utt_synth_wave(cst_wave *w,cst_voice *v)
	{
	    /* Build a new utterance around an existing wave, as if voice v had */
	    /* just synthesized it.  If the utterance features carry a */
	    /* "streaming_info" value, the whole wave is passed once to its */
	    /* streaming callback.  Returns the new utterance. */
	    cst_utterance *u;
	    const cst_val *streaming_info_val;
	    cst_audio_streaming_info *asi = NULL;

	    u = new_utterance();
	    utt_init(u,v);
	    utt_set_wave(u,w);

	    streaming_info_val=get_param_val(u->features,"streaming_info",NULL);
	    if (streaming_info_val)
	    {
	        asi = val_audio_streaming_info(streaming_info_val);
	        asi->utt = u;
	    }

	    if (!asi) return u; /* no stream */

	    /* Do streaming */
	    (*asi->asc)(w,0,w->num_samples,1,asi);

	    return u;
	}

	cst_utterance *apply_synth_module(cst_utterance *u,
	                                  const cst_synth_module *mod)
	{
	    /* Run a single synthesis stage on u.  A function stored in the */
	    /* utterance features under mod->hookname takes precedence; failing */
	    /* that the stage's default hook is used; with neither, u is */
	    /* returned untouched. */
	    const cst_val *v;

	    v = feat_val(u->features, mod->hookname);
	    if (v)
	        return (*val_uttfunc(v))(u);
	    if (mod->defhook)
	        return (*mod->defhook)(u);
	    return u;
	}

	cst_utterance *apply_synth_method(cst_utterance *u,
	                                  const cst_synth_module meth[])
	{
	    /* Run every stage of a NULL-hookname-terminated stage table in */
	    /* order, threading the utterance through.  Stops early and returns */
	    /* NULL as soon as any stage returns NULL. */
	    while (meth->hookname)
	    {
	        if ((u = apply_synth_module(u, meth)) == NULL)
	            return NULL;
	        ++meth;
	    }

	    return u;
	}

	cst_utterance *utt_init(cst_utterance *u, cst_voice *vox)
	{
	    /* Link the vox features into the utterance features so the voice */
	    /* features will be searched too (after the utt ones) */
	    feat_link_into(vox->features,u->features);
	    feat_link_into(vox->ffunctions,u->ffunctions);

	    /* Do the initialization function, if there is one */
	    if (vox->utt_init)
	        vox->utt_init(u, vox);

	    return u;
	}

	/* Run the full text synthesis stage table (synth_method_text) on u. */
	/* Returns the resulting utterance, or NULL if a stage returned NULL. */
	cst_utterance *utt_synth(cst_utterance *u)
	{
	    return apply_synth_method(u, synth_method_text);
	}

	/* Run the synth_method_tokens stage table (no tokenizer stage) on u. */
	/* Returns the resulting utterance, or NULL if a stage returned NULL. */
	cst_utterance *utt_synth_tokens(cst_utterance *u)
	{
	    return apply_synth_method(u, synth_method_tokens);
	}

	/* Run the synth_method_text2segs stage table on u, i.e. the stages up */
	/* to and including pause insertion.  Returns the resulting utterance, */
	/* or NULL if a stage returned NULL. */
	cst_utterance *utt_synth_text2segs(cst_utterance *u)
	{
	    return apply_synth_method(u, synth_method_text2segs);
	}

	/* Run the synth_method_phones stage table on u.  Returns the */
	/* resulting utterance, or NULL if a stage returned NULL. */
	cst_utterance *utt_synth_phones(cst_utterance *u)
	{
	    return apply_synth_method(u, synth_method_phones);
	}

	cst_utterance *default_tokenization(cst_utterance *u)
	{
	    /* Split the utterance's input text into tokens and record each */
	    /* non-empty one as an item of a new "Token" relation, together */
	    /* with the whitespace and punctuation the token stream reports for */
	    /* it and its file position / line number.  The character classes */
	    /* come from the "text_*" features.  Returns u. */
	    const char *text,*token;
	    cst_tokenstream *fd;
	    cst_item *t;
	    cst_relation *r;

	    text = utt_input_text(u);
	    r = utt_relation_create(u,"Token");
	    fd = ts_open_string(text,
	        get_param_string(u->features,"text_whitespace",NULL),
	        get_param_string(u->features,"text_singlecharsymbols",NULL),
	        get_param_string(u->features,"text_prepunctuation",NULL),
	        get_param_string(u->features,"text_postpunctuation",NULL));

	    while(!ts_eof(fd))
	    {
	        token = ts_get(fd);
	        if (cst_strlen(token) > 0)
	        {
	            t = relation_append(r,NULL);
	            item_set_string(t,"name",token);
	            item_set_string(t,"whitespace",fd->whitespace);
	            item_set_string(t,"prepunctuation",fd->prepunctuation);
	            item_set_string(t,"punc",fd->postpunctuation);
	            item_set_int(t,"file_pos",fd->file_pos);
	            item_set_int(t,"line_number",fd->line_number);
	        }
	    }

	    ts_close(fd);

	    return u;
	}

	/* Default token-to-words mapping: a newly built one-element list */
	/* holding the token's "name" as its only word.  default_textanalysis */
	/* deletes the list it gets back from this function. */
	cst_val *default_tokentowords(cst_item *i)
	{
	    return cons_val(string_val(item_feat_string(i,"name")), NULL);
	}

	cst_utterance *default_textanalysis(cst_utterance *u)
	{
	    /* Expand each token into words.  The word list for a token comes */
	    /* from the "tokentowords_func" feature when present, otherwise */
	    /* from default_tokentowords.  Every word becomes a daughter of its */
	    /* token and is appended to a new "Word" relation.  A list entry */
	    /* may be a plain string, or a cons whose car is the name and whose */
	    /* cdr carries extra features to copy onto the word.  Returns u. */
	    cst_item *t,*word;
	    cst_relation *word_rel;
	    cst_val *words;
	    const cst_val *w;
	    const cst_val *ttwv;

	    word_rel = utt_relation_create(u,"Word");
	    ttwv = feat_val(u->features, "tokentowords_func");

	    for (t=relation_head(utt_relation(u,"Token")); t; t=item_next(t))
	    {
	        if (ttwv)
	            words = (cst_val *)(*val_itemfunc(ttwv))(t);
	        else
	            words = default_tokentowords(t);

	        for (w=words; w; w=val_cdr(w))
	        {
	            word = item_add_daughter(t,NULL);
	            if (cst_val_consp(val_car(w)))
	            { /* Has extra features */
	                item_set_string(word,"name",val_string(val_car(val_car(w))));
	                feat_copy_into(val_features(val_cdr(val_car(w))),
	                               item_feats(word));
	            }
	            else
	                item_set_string(word,"name",val_string(val_car(w)));
	            relation_append(word_rel,word);
	        }
	        delete_val(words);
	    }

	    return u;
	}

	cst_utterance *default_phrasing(cst_utterance *u)
	{
	    /* Group the words into phrases in a new "Phrase" relation.  Each */
	    /* phrase item is named "B" and has its words as daughters.  When a */
	    /* "phrasing_cart" feature exists, a word for which that tree */
	    /* predicts "BB" ends the current phrase; without it all words fall */
	    /* into one phrase.  Returns u. */
	    cst_relation *r;
	    cst_item *w, *p, *lp=NULL;
	    const cst_val *v;
	    cst_cart *phrasing_cart;

	    r = utt_relation_create(u,"Phrase");
	    if (feat_present(u->features,"phrasing_cart"))
	        phrasing_cart = val_cart(feat_val(u->features,"phrasing_cart"));
	    else
	        phrasing_cart = NULL;

	    for (p=NULL,w=relation_head(utt_relation(u,"Word")); w; w=item_next(w))
	    {
	        if (p == NULL)
	        {
	            p = relation_append(r,NULL);
	            lp = p;
	            item_set_string(p,"name","B");
	        }
	        item_add_daughter(p,w);
	        if (phrasing_cart)
	        {
	            v = cart_interpret(w,phrasing_cart);
	            if (cst_streq(val_string(v),"BB"))
	                p = NULL;
	        }
	    }

	    /* follow festival: the final phrase is renamed "BB", but only when */
	    /* it is not the sole phrase */
	    if (lp && item_prev(lp))
	        item_set_string(lp,"name","BB");

	    return u;
	}

	cst_utterance *default_pause_insertion(cst_utterance *u)
	{
	    /* Add initial silences and silence at each phrase break */
	    const char *silence;
	    const cst_item *w;
	    cst_item *p, *s;

	    silence = val_string(feat_val(u->features,"silence"));

	    /* Insert initial silence */
	    s = relation_head(utt_relation(u,"Segment"));
	    if (s == NULL)
	        s = relation_append(utt_relation(u,"Segment"),NULL);
	    else
	        s = item_prepend(s,NULL);
	    item_set_string(s,"name",silence);

	    for (p=relation_head(utt_relation(u,"Phrase")); p; p=item_next(p))
	    {
	        /* Walk back from the phrase's last word to the first word that */
	        /* actually has a final segment and put the silence after it. */
	        for (w = item_last_daughter(p); w; w=item_prev(w))
	        {
	            s = path_to_item(w,"R:SylStructure.daughtern.daughtern.R:Segment");
	            if (s)
	            {
	                s = item_append(s,NULL);
	                item_set_string(s,"name",silence);
	                break;
	            }
	        }
	    }

	    return u;
	}

	cst_utterance *cart_intonation(cst_utterance *u)
	{
	    /* Predict an accent and an end tone for every syllable using the */
	    /* "int_cart_accents" and "int_cart_tones" trees.  A prediction of */
	    /* "NONE" leaves the corresponding feature unset.  Returns u. */
	    cst_cart *accents, *tones;
	    cst_item *s;
	    const cst_val *v;

	    if (feat_present(u->features,"no_intonation_accent_model"))
	        return u; /* not all languages have intonation models */

	    accents = val_cart(feat_val(u->features,"int_cart_accents"));
	    tones = val_cart(feat_val(u->features,"int_cart_tones"));

	    for (s=relation_head(utt_relation(u,"Syllable")); s; s=item_next(s))
	    {
	        v = cart_interpret(s,accents);
	        if (!cst_streq("NONE",val_string(v)))
	            item_set_string(s,"accent",val_string(v));
	        v = cart_interpret(s,tones);
	        if (!cst_streq("NONE",val_string(v)))
	            item_set_string(s,"endtone",val_string(v));
	        DPRINTF(0,("word %s gpos %s stress %s ssyl_in %s ssyl_out %s accent %s endtone %s\n",
	                   ffeature_string(s,"R:SylStructure.parent.name"),
	                   ffeature_string(s,"R:SylStructure.parent.gpos"),
	                   ffeature_string(s,"stress"),
	                   ffeature_string(s,"ssyl_in"),
	                   ffeature_string(s,"ssyl_out"),
	                   ffeature_string(s,"accent"),
	                   ffeature_string(s,"endtone")));
	    }

	    return u;
	}

	/* Registers dur_stats as a value type.  NOTE(review): presumably this is
	   what provides the val_dur_stats() accessor used by cart_duration, and
	   the NODEL variant means no deleter is registered -- confirm against the
	   macro definition. */
	CST_VAL_REGISTER_TYPE_NODEL(dur_stats,dur_stats)

	const dur_stat phone_dur_stat(const dur_stats ds,const char *ph)
	{
	int i;
	for (i=0; ds[i]; i++)
	if (cst_streq(ph,ds[i]->phone))
	return ds[i];

	return ds[0];
	}

	/*
	 * Predict segment durations and store cumulative end times.
	 *
	 * Does nothing when the utterance has the feature
	 * "no_segment_duration_model".  Otherwise, for each item of the "Segment"
	 * relation, the "dur_cart" tree yields a z-score that is scaled by the
	 * phone's stddev and mean taken from the "dur_stats" table, then multiplied
	 * by the stretch factor.  The stretch factor is the utterance's
	 * "duration_stretch" parameter (default 1.0), further multiplied by the
	 * owning token's "local_duration_stretch" when that is non-zero.  The
	 * running sum of durations is written to each segment's "end" feature.
	 *
	 * Returns the utterance that was passed in.
	 */
	cst_utterance *cart_duration(cst_utterance *u)
	{
	    cst_cart *dur_tree;
	    cst_item *s;
	    float zdur, dur_stretch, local_dur_stretch, dur;
	    float end = 0;
	    dur_stats *ds;
	    const dur_stat *pstat;   /* named so it does not shadow the type */

	    if (feat_present(u->features,"no_segment_duration_model"))
	        return u;  /* not all methods need segment durations */

	    dur_tree = val_cart(feat_val(u->features,"dur_cart"));
	    dur_stretch = get_param_float(u->features,"duration_stretch", 1.0);
	    ds = val_dur_stats(feat_val(u->features,"dur_stats"));

	    for (s = relation_head(utt_relation(u,"Segment")); s; s = item_next(s))
	    {
	        zdur = val_float(cart_interpret(s,dur_tree));
	        pstat = phone_dur_stat(ds,item_name(s));

	        local_dur_stretch = ffeature_float(s, "R:SylStructure.parent.parent."
	                                           "R:Token.parent.local_duration_stretch");
	        /* A zero local stretch means "not set": use the global one only */
	        if (local_dur_stretch)
	            local_dur_stretch *= dur_stretch;
	        else
	            local_dur_stretch = dur_stretch;

	        dur = local_dur_stretch * ((zdur*pstat->stddev)+pstat->mean);
	        DPRINTF(0,("phone %s accent %s stress %s pdur %f stretch %f mean %f std %f dur %f\n",
	                   item_name(s),
	                   ffeature_string(s,"R:SylStructure.parent.accented"),
	                   ffeature_string(s,"R:SylStructure.parent.stress"),
	                   zdur, local_dur_stretch, pstat->mean,
	                   pstat->stddev, dur));
	        end += dur;
	        item_set_float(s,"end",end);
	    }
	    return u;
	}

	/*
	 * Tag each word with a part of speech.
	 *
	 * When the utterance has a "pos_tagger_cart" parameter, that CART is
	 * interpreted on every item of the "Word" relation and the result is
	 * stored as the word's "pos" feature.  Without the parameter the
	 * utterance is returned untouched.
	 *
	 * Returns the utterance that was passed in.
	 */
	cst_utterance *default_pos_tagger(cst_utterance *u)
	{
	    cst_item *word;
	    const cst_val *p;
	    const cst_cart *tagger;

	    p = get_param_val(u->features,"pos_tagger_cart",NULL);
	    if (p == NULL)
	        return u;
	    tagger = val_cart(p);

	    for (word = relation_head(utt_relation(u,"Word"));
	         word;
	         word = item_next(word))
	    {
	        p = cart_interpret(word,tagger);
	        item_set_string(word,"pos",val_string(p));
	    }

	    return u;
	}

	/*
	 * Build the "Syllable", "SylStructure" and "Segment" relations from words.
	 *
	 * For every item of the "Word" relation a phone list is obtained from, in
	 * order of preference:
	 *   1. the "phones" feature of the word's parent token (either a cons
	 *      list used as-is, or a string that is parsed; a string identical to
	 *      the previous word's token phones is treated as already consumed),
	 *   2. the lexicon's addenda entry for the word name,
	 *   3. lex_lookup() on the word name and its "pos" feature.
	 * Each phone becomes a segment; a trailing '0' or '1' on the phone name is
	 * stripped and recorded as the stress of the current syllable.  Syllable
	 * boundaries are decided by the lexicon's syl_boundary callback.
	 *
	 * Returns the utterance that was passed in.
	 */
	cst_utterance *default_lexical_insertion(cst_utterance *u)
	{
	    cst_item *word;
	    cst_relation *sylstructure, *seg, *syl;
	    cst_lexicon *lex;
	    const cst_val *lex_addenda = NULL;
	    const cst_val *p, *wp = NULL;
	    char *phone_name;
	    const char *stress = "0";
	    const char *pos;
	    cst_val *phones;
	    cst_item *ssword, *sssyl, *segitem, *sylitem, *seg_in_syl;
	    const cst_val *vpn;
	    int dp = 0;
	    int plen;

	    lex = val_lexicon(feat_val(u->features,"lexicon"));
	    if (lex->lex_addenda)
	        lex_addenda = lex->lex_addenda;

	    syl = utt_relation_create(u,"Syllable");
	    sylstructure = utt_relation_create(u,"SylStructure");
	    seg = utt_relation_create(u,"Segment");

	    for (word = relation_head(utt_relation(u,"Word"));
	         word;
	         word = item_next(word))
	    {
	        ssword = relation_append(sylstructure,word);
	        pos = ffeature_string(word,"pos");
	        phones = NULL;
	        wp = NULL;
	        dp = 0;  /* should the phones get deleted or not */

	        /* FIXME: need to make sure that textanalysis won't split
	           tokens with explicit pronunciation (or that it will
	           propagate such to words, then we can remove the path here) */
	        if (item_feat_present(item_parent(item_as(word, "Token")), "phones"))
	        {
	            vpn = item_feat(item_parent(item_as(word, "Token")), "phones");
	            if (cst_val_consp(vpn))
	            {   /* for SAPI ?? */
	                /* awb oct11: this seems wrong -- */
	                /* not sure SAPI still (ever) works Oct11 */
	                phones = (cst_val *) vpn;   /* borrowed: dp stays 0 */
	            }
	            else
	            {
	                dp = 1;
	                if (cst_streq(val_string(vpn),
	                              ffeature_string(word,"p.R:Token.parent.phones")))
	                    phones = NULL;  /* Already given these phones */
	                else
	                    phones = val_readlist_string(val_string(vpn));
	            }
	        }
	        else
	        {
	            wp = val_assoc_string(item_feat_string(word, "name"),lex_addenda);
	            if (wp)
	                /* borrowed from the addenda: dp stays 0 */
	                phones = (cst_val *)val_cdr(val_cdr(wp));
	            else
	            {
	                dp = 1;
	                phones = lex_lookup(lex,item_feat_string(word,"name"),pos,
	                                    u->features);
	            }
	        }

	        for (sssyl = NULL, sylitem = NULL, p = phones; p; p = val_cdr(p))
	        {
	            if (sylitem == NULL)
	            {
	                sylitem = relation_append(syl,NULL);
	                sssyl = item_add_daughter(ssword,sylitem);
	                stress = "0";
	            }
	            segitem = relation_append(seg,NULL);
	            phone_name = cst_strdup(val_string(val_car(p)));

	            /* Strip a trailing stress digit.  The length is checked first
	               so an empty phone name never indexes phone_name[-1]. */
	            plen = (int)cst_strlen(phone_name);
	            if (plen > 0 && phone_name[plen-1] == '1')
	            {
	                stress = "1";
	                phone_name[plen-1] = '\0';
	            }
	            else if (plen > 0 && phone_name[plen-1] == '0')
	            {
	                stress = "0";
	                phone_name[plen-1] = '\0';
	            }
	            item_set_string(segitem,"name",phone_name);
	            seg_in_syl = item_add_daughter(sssyl,segitem);

	            if ((lex->syl_boundary)(seg_in_syl,val_cdr(p)))
	            {
	                /* Syllable is complete: next phone opens a new one */
	                sylitem = NULL;
	                if (sssyl)
	                    item_set_string(sssyl,"stress",stress);
	            }
	            cst_free(phone_name);
	        }
	        if (dp)
	        {
	            delete_val(phones);
	            phones = NULL;
	        }
	    }

	    return u;
	}

	/* Dummy F0 modelling for phones, copied directly from us_f0_model.c */
	/*
	 * Creates a "Target" relation with two F0 targets: mean+stddev at
	 * position 0.0 and mean-stddev at the "end" of the last item of the
	 * "Segment" relation.  mean is "target_f0_mean" (default 100.0) times
	 * "f0_shift" (default 1.0); stddev is "target_f0_stddev" (default 12.0).
	 *
	 * Returns the utterance that was passed in.
	 */
	cst_utterance *flat_prosody(cst_utterance *u)
	{
	    /* F0 target model */
	    cst_item *s, *t;
	    cst_relation *targ_rel;
	    float mean, stddev;

	    targ_rel = utt_relation_create(u,"Target");
	    mean = get_param_float(u->features,"target_f0_mean", 100.0);
	    mean *= get_param_float(u->features,"f0_shift", 1.0);
	    stddev = get_param_float(u->features,"target_f0_stddev", 12.0);

	    /* Start target at time zero */
	    t = relation_append(targ_rel,NULL);
	    item_set_float(t,"pos",0.0);
	    item_set_float(t,"f0",mean+stddev);

	    /* End target at the end time of the final segment */
	    s = relation_tail(utt_relation(u,"Segment"));
	    t = relation_append(targ_rel,NULL);
	    item_set_float(t,"pos",item_feat_float(s,"end"));
	    item_set_float(t,"f0",mean-stddev);

	    return u;
	}

	/*
	 * Treat each token as a phone name and build the "Segment", "Syllable",
	 * "Word" and "SylStructure" relations directly from the "Token" relation.
	 *
	 * All segments hang under a single word named "phonestring".  A trailing
	 * '0' or '1' on a token is stripped and stored as the "stress" feature of
	 * the current syllable.  The token "-" is a syllable break.  Any other
	 * token must be a member of the utterance's "phoneset"; if not, an error
	 * message is emitted and cst_error() is raised.
	 *
	 * NOTE(review): the Segment item is appended before the token is
	 * classified, so a "-" token (and an unknown phone) leaves a Segment item
	 * that never receives a "name" -- confirm whether that is intended.
	 *
	 * Returns the utterance that was passed in.
	 */
	static cst_utterance *tokentosegs(cst_utterance *u)
	{
	    cst_item *t;
	    cst_relation *seg, *syl, *sylstructure, *word;
	    cst_item *sylitem, *sylstructureitem, *worditem, *sssyl;
	    cst_phoneset *ps;

	    ps = val_phoneset(utt_feat_val(u, "phoneset"));
	    /* Just copy tokens into the Segment relation */
	    seg = utt_relation_create(u, "Segment");
	    syl = utt_relation_create(u, "Syllable");
	    word = utt_relation_create(u, "Word");
	    sylstructure = utt_relation_create(u, "SylStructure");
	    sssyl = sylitem = worditem = sylstructureitem = 0;
	    for (t = relation_head(utt_relation(u, "Token")); t; t = item_next(t))
	    {
	        cst_item *segitem = relation_append(seg, NULL);
	        char const *pname = item_feat_string(t, "name");
	        char *name = cst_strdup(pname);
	        int nlen = (int)cst_strlen(name);
	        int unknown_phone = 0;

	        if (worditem == 0)
	        {
	            worditem = relation_append(word,NULL);
	            item_set_string(worditem, "name", "phonestring");
	            sylstructureitem = relation_append(sylstructure,worditem);
	        }
	        if (sylitem == 0)
	        {
	            sylitem = relation_append(syl,NULL);
	            sssyl = item_add_daughter(sylstructureitem,sylitem);
	        }

	        /* Strip a trailing stress digit; the length check keeps an empty
	           token name from indexing name[-1]. */
	        if (nlen > 0 && name[nlen-1] == '1')
	        {
	            item_set_string(sssyl,"stress","1");
	            name[nlen-1] = '\0';
	        }
	        else if (nlen > 0 && name[nlen-1] == '0')
	        {
	            item_set_string(sssyl,"stress","0");
	            name[nlen-1] = '\0';
	        }

	        if (cst_streq(name,"-"))
	        {
	            sylitem = 0;  /* syllable break */
	        }
	        else if (phone_id(ps, name) == -1)
	        {
	            unknown_phone = 1;
	        }
	        else
	        {
	            item_add_daughter(sssyl,segitem);
	            item_set_string(segitem, "name", name);
	        }

	        /* Release the copy before raising the error so it is not leaked
	           if cst_error() does not return.  pname belongs to the token
	           item and is still valid for the message. */
	        cst_free(name);

	        if (unknown_phone)
	        {
	            cst_errmsg("Phone `%s' not in phoneset\n", pname);
	            cst_error();
	        }
	    }

	    return u;
	}

	/*
	 * Decide whether an utterance boundary falls between the last token
	 * already in `tokens` and the upcoming `token`.
	 *
	 * This is the default utt break function, languages may override this.
	 * This will be ok for some latin based languages.
	 *
	 * ts     - token stream; its `whitespace` field holds the whitespace that
	 *          preceded `token`.
	 * token  - text of the upcoming token.
	 * tokens - relation of tokens collected so far; its tail supplies the
	 *          previous token's name and its "punc" feature.
	 *
	 * Returns TRUE when the utterance should be broken before `token`,
	 * FALSE otherwise.  Rules, in order:
	 *   1. two or more newlines in the whitespace              -> TRUE
	 *   2. "Yahoo!" (any of three casings) + lowercase token   -> FALSE
	 *   3. previous punctuation contains ':', '?' or '!'       -> TRUE
	 *   4. '.' + more than one whitespace char + capital       -> TRUE
	 *   5. '.' + capital, previous token not an abbreviation   -> TRUE
	 *   6. anything else                                       -> FALSE
	 *
	 * A NUL character is never treated as a letter, and an empty previous
	 * token is never indexed at position -1.
	 */
	int default_utt_break(cst_tokenstream *ts,
	                      const char *token,
	                      cst_relation *tokens)
	{
	    static const char * const upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	    static const char * const lower = "abcdefghijklmnopqrstuvwxyz";
	    const char *postpunct = item_feat_string(relation_tail(tokens), "punc");
	    const char *ltoken = item_name(relation_tail(tokens));
	    const int ltoken_len = (int)cst_strlen(ltoken);
	    const char ltoken_last = (ltoken_len > 0) ? ltoken[ltoken_len-1] : '\0';
	    /* strchr-style lookups match the terminator when asked for '\0',
	       so every character is checked for NUL first. */
	    const int next_is_upper =
	        (token[0] != '\0') && (cst_strchr(upper,token[0]) != NULL);
	    const int next_is_lower =
	        (token[0] != '\0') && (cst_strchr(lower,token[0]) != NULL);
	    const int last_ends_upper =
	        (ltoken_last != '\0') && (cst_strchr(upper,ltoken_last) != NULL);
	    const int last_short_capitalised =
	        (ltoken_len > 0) && (ltoken_len < 4) &&
	        (cst_strchr(upper,ltoken[0]) != NULL);
	    const int has_period = (cst_strchr(postpunct,'.') != NULL);

	    if (cst_strchr(ts->whitespace,'\n') != cst_strrchr(ts->whitespace,'\n'))
	        /* contains two new lines */
	        return TRUE;

	    /* Well, this is a little specific isn't it. */
	    if ((cst_streq(ltoken,"Yahoo") ||
	         cst_streq(ltoken,"YAHOO") ||
	         cst_streq(ltoken,"yahoo")) &&
	        cst_strchr(postpunct,'!') &&
	        next_is_lower)
	        return FALSE;

	    if (cst_strchr(postpunct,':') ||
	        cst_strchr(postpunct,'?') ||
	        cst_strchr(postpunct,'!'))
	        return TRUE;

	    if (has_period &&
	        (cst_strlen(ts->whitespace) > 1) &&
	        next_is_upper)
	        return TRUE;

	    if (has_period &&
	        /* next word starts with a capital */
	        next_is_upper &&
	        /* last word isn't an abbreviation */
	        !(last_ends_upper || last_short_capitalised))
	        return TRUE;

	    return FALSE;
	}