blob: d74a81ac17e63f743b04ba56df7d0e7d60404658 [file] [log] [blame]
/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 2007 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Authors: Alan W Black (awb@cs.cmu.edu) */
/* Date: November 2007 */
/*************************************************************************/
/* */
/* Implementation of Clustergen, Statistical Parameter Synthesizer in */
/* Flite */
/* */
/* A statistical corpus based synthesizer. */
/*  See Black, A. (2006), "CLUSTERGEN: A Statistical Parametric          */
/* Synthesizer using Trajectory Modeling", Interspeech 2006 - ICSLP, */
/* Pittsburgh, PA. */
/* http://www.cs.cmu.edu/~awb/papers/is2006/IS061394.PDF */
/* */
/* Uses MLSA for resynthesis and MLPG for smoothing */
/*  mlsa and mlpg come from Festvox's VC code (which in turn came        */
/*  from NITECH's HTS)                                                   */
/* */
/*************************************************************************/
#include "cst_cg.h"
#include "cst_spamf0.h"
#include "cst_hrg.h"
#include "cst_utt_utils.h"
#include "cst_audio.h"
CST_VAL_REGISTER_TYPE(cg_db,cst_cg_db)
static cst_utterance *cg_make_hmmstates(cst_utterance *utt);
static cst_utterance *cg_make_params(cst_utterance *utt);
static cst_utterance *cg_predict_params(cst_utterance *utt);
static cst_utterance *cg_resynth(cst_utterance *utt);
void delete_cg_db(cst_cg_db *db)
{
    /* Free a clustergen voice database and everything it owns.     */
    /* Voices compiled into the binary (freeable == 0) live in the  */
    /* data segment and are left alone; only heap-loaded voices are */
    /* actually released.                                           */
    int i,j;

    if (db->freeable == 0)
        return; /* its in the data segment, so not freeable */

    /* Woo Hoo! We're gonna free this garbage with a big mallet */
    /* In spite of what the const qualifiers say ... */
    cst_free((void *)db->name);

    /* NULL-terminated array of model type-name strings */
    for (i=0; db->types && db->types[i]; i++)
        cst_free((void *)db->types[i]);
    cst_free((void *)db->types);

    /* NULL-terminated array of F0 prediction trees */
    for (i=0; db->f0_trees && db->f0_trees[i]; i++)
        delete_cart((cst_cart *)(void *)db->f0_trees[i]);
    cst_free((void *)db->f0_trees);

    /* One NULL-terminated tree array per parameter model */
    for (j=0; j<db->num_param_models; j++)
    {
        for (i=0; db->param_trees[j] && db->param_trees[j][i]; i++)
            delete_cart((cst_cart *)(void *)db->param_trees[j][i]);
        cst_free((void *)db->param_trees[j]);
    }
    cst_free((void *)db->param_trees);

    /* Optional SPAM F0 accent/phrase model */
    if (db->spamf0)
    {
        delete_cart((cst_cart *)(void *)db->spamf0_accent_tree);
        delete_cart((cst_cart *)(void *)db->spamf0_phrase_tree);
        for (i=0; i< db->num_frames_spamf0_accent; i++)
            cst_free((void *)db->spamf0_accent_vectors[i]);
        cst_free((void *)db->spamf0_accent_vectors);
    }

    /* Model vectors: num_frames[j] frames for each parameter model j */
    for (j=0; j<db->num_param_models; j++)
    {
        for (i=0; i<db->num_frames[j]; i++)
            cst_free((void *)db->model_vectors[j][i]);
        cst_free((void *)db->model_vectors[j]);
    }
    cst_free(db->num_channels);
    cst_free(db->num_frames);
    cst_free((void *)db->model_vectors);
    cst_free((void *)db->model_min);
    cst_free((void *)db->model_range);

    /* Duration statistics and duration trees, one set per dur model */
    for (j = 0; j<db->num_dur_models; j++)
    {
        for (i=0; db->dur_stats[j] && db->dur_stats[j][i]; i++)
        {
            cst_free((void *)db->dur_stats[j][i]->phone);
            cst_free((void *)db->dur_stats[j][i]);
        }
        cst_free((void *)db->dur_stats[j]);
        delete_cart((void *)db->dur_cart[j]);
    }
    cst_free((void *)db->dur_stats);
    cst_free((void *)db->dur_cart);

    /* phone -> HMM state-name table (NULL-terminated in both dims) */
    for (i=0; db->phone_states && db->phone_states[i]; i++)
    {
        for (j=0; db->phone_states[i][j]; j++)
            cst_free((void *)db->phone_states[i][j]);
        cst_free((void *)db->phone_states[i]);
    }
    cst_free((void *)db->phone_states);

    cst_free((void *)db->dynwin);

    /* Mixed-excitation filter coefficients */
    for (i=0; i<db->ME_num; i++)
        cst_free((void *)db->me_h[i]);
    cst_free((void *)db->me_h);

    cst_free((void *)db);
}
/* */
cst_utterance *cg_synth(cst_utterance *utt)
{
    /* Top-level clustergen synthesis pipeline: build HMM states under */
    /* the segments, lay out parameter frames, predict the parameter   */
    /* tracks (optionally applying the SPAM F0 model), then resynthesize */
    /* the waveform.  Returns the (modified) input utterance.          */
    cst_cg_db *db = val_cg_db(utt_feat_val(utt,"cg_db"));

    cg_make_hmmstates(utt);
    cg_make_params(utt);
    cg_predict_params(utt);
    if (db->spamf0)
        cst_spamf0(utt);
    cg_resynth(utt);

    return utt;
}
static float cg_state_duration(cst_item *s, cst_cg_db *cg_db)
{
    /* Predict the duration of HMM state item s.  Each duration model */
    /* predicts a normalized (z-score) duration; the average z-score  */
    /* is then de-normalized with the phone's duration mean/stddev.   */
    float zdur, dur;
    const char *n;
    int i, x, dm;

    /* Average the z-score predictions over all duration models */
    for (dm=0,zdur=0.0; dm < cg_db->num_dur_models; dm++)
        zdur += val_float(cart_interpret(s,cg_db->dur_cart[dm]));
    zdur /= dm; /* get average zdur prediction from all dur models */

    n = item_feat_string(s,"name");

    /* Note we only use the dur stats from the first model, that is */
    /* correct, but wouldn't be if the dur tree was trained on different */
    /* data */
    for (x=i=0; cg_db->dur_stats[0][i]; i++)
    {
        if (cst_streq(cg_db->dur_stats[0][i]->phone,n))
        {
            x=i;
            break;
        }
    }
    if (!cg_db->dur_stats[0][i]) /* unknown type name */
        x = 0;   /* fall back to the first phone's stats */

    /* De-normalize: duration = z * stddev + mean for this phone */
    dur = (zdur*cg_db->dur_stats[0][x]->stddev)+cg_db->dur_stats[0][x]->mean;
    /* dur = 1.2 * (float)exp((float)dur); */

    return dur;
}
static cst_utterance *cg_make_hmmstates(cst_utterance *utt)
{
    /* Build HMM state structure below the segment structure */
    /* Creates the "HMMstate" relation (one item per HMM state) and  */
    /* the "segstate" relation linking each Segment to its states.   */
    cst_cg_db *cg_db;
    cst_relation *hmmstate, *segstate;
    cst_item *seg, *s, *ss;
    const char *segname;
    int sp,p;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    hmmstate = utt_relation_create(utt,"HMMstate");
    segstate = utt_relation_create(utt,"segstate");

    for (seg = utt_rel_head(utt,"Segment"); seg; seg=item_next(seg))
    {
        ss = relation_append(segstate,seg);
        segname = item_feat_string(seg,"name");
        /* Find this phone's row in the phone -> state-names table; */
        /* entry [p][0] is the phone name itself.                   */
        for (p=0; cg_db->phone_states[p]; p++)
            if (cst_streq(segname,cg_db->phone_states[p][0]))
                break;
        if (cg_db->phone_states[p] == NULL)
            p = 0; /* unknown phoneme */
        /* Entries [p][1..n] name the phone's HMM states; append each */
        /* as a daughter of the segment's segstate item.              */
        for (sp=1; cg_db->phone_states[p][sp]; sp++)
        {
            s = relation_append(hmmstate,NULL);
            item_add_daughter(ss,s);
            item_set_string(s,"name",cg_db->phone_states[p][sp]);
            item_set_int(s,"statepos",sp);
        }
    }

    return utt;
}
static cst_utterance *cg_make_params(cst_utterance *utt)
{
    /* puts in the frame items */
    /* historically called "mcep" but can actually be any random vectors */
    /* Predicts each HMM state's duration, converts it to a run of   */
    /* fixed-advance frames in the "mcep" relation, and links frames */
    /* to their state through the "mcep_link" relation.              */
    cst_cg_db *cg_db;
    cst_relation *mcep, *mcep_link;
    cst_item *s, *mcep_parent, *mcep_frame;
    int num_frames;
    float start, end;
    float dur_stretch, tok_stretch, rdur;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    mcep = utt_relation_create(utt,"mcep");
    mcep_link = utt_relation_create(utt,"mcep_link");
    end = 0.0;
    num_frames = 0;
    /* Global rate control; 1.0 means no stretch */
    dur_stretch = get_param_float(utt->features,"duration_stretch", 1.0);

    for (s = utt_rel_head(utt,"HMMstate"); s; s=item_next(s))
    {
        start = end;
        /* Optional per-token rate control; 0 is treated as no stretch */
        tok_stretch = ffeature_float(s,"R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_duration_stretch");
        if (tok_stretch == 0)
            tok_stretch = 1.0;
        rdur = tok_stretch*dur_stretch*cg_state_duration(s,cg_db);
        /* Guarantee duration to be at least one frame */
        if (rdur < cg_db->frame_advance)
            end = start + cg_db->frame_advance;
        else
            end = start + rdur;
        item_set_float(s,"end",end);
        /* One mcep_link parent per state; its daughters are the frames */
        mcep_parent = relation_append(mcep_link, s);
        for ( ; (num_frames * cg_db->frame_advance) <= end; num_frames++ )
        {
            mcep_frame = relation_append(mcep,NULL);
            item_add_daughter(mcep_parent,mcep_frame);
            item_set_int(mcep_frame,"frame_number",num_frames);
            item_set(mcep_frame,"name",item_feat(mcep_parent,"name"));
        }
    }

    /* Copy duration up onto Segment relation */
    for (s = utt_rel_head(utt,"Segment"); s; s=item_next(s))
        item_set(s,"end",ffeature(s,"R:segstate.daughtern.end"));

    utt_set_feat_int(utt,"param_track_num_frames",num_frames);

    return utt;
}
#if CG_OLD
static int voiced_frame(cst_item *m)
{
const char *ph_vc;
const char *ph_cvox;
ph_vc = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_vc");
ph_cvox = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_cvox");
if (cst_streq("-",ph_vc) &&
cst_streq("-",ph_cvox))
return 0; /* unvoiced */
else
return 1; /* voiced */
}
#endif
static int voiced_frame(cst_item *m)
{
    /* Decide whether mcep frame m should be treated as voiced:       */
    /* pauses are always unvoiced, phonologically voiced phones are   */
    /* always voiced, and everything else falls back on the predicted */
    /* "voicing" feature on the frame.                                */
    const char *vc = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.ph_vc");
    const char *seg = ffeature_string(m,"R:mcep_link.parent.R:segstate.parent.name");

    if (cst_streq(seg,"pau"))
        return 0;                     /* silence: unvoiced */
    if (cst_streq("+",vc))
        return 1;                     /* phonologically voiced */
    /* Even though the range is 0-10, 0.5 is the intended threshold */
    return (item_feat_float(m,"voicing") > 0.5) ? 1 : 0;
}
static float catmull_rom_spline(float p,float p0,float p1,float p2,float p3)
/* http://www.mvps.org/directx/articles/ */
{
    /* Catmull-Rom interpolation: evaluate the cubic through control  */
    /* points p0..p3 at parameter p in [0,1]; returns p1 at p==0 and  */
    /* p2 at p==1.  The polynomial coefficients are named so the      */
    /* Horner-free form below matches the standard 0.5-weighted basis. */
    double c0 = 2.0 * p1;
    double c1 = -p0 + p2;
    double c2 = ((2.0 * p0) - (5.0 * p1)) + ((4.0 * p2) - p3);
    double c3 = -p0 + ((3.0 * p1) - (3.0 * p2)) + p3;

    return 0.5 * (c0 + (p * c1) + ((p * p) * c2) + ((p * p * p) * c3));
}
static void cg_F0_interpolate_spline(cst_utterance *utt,
                                     cst_track *param_track)
{
    /* Smooth the predicted F0 contour (channel 0 of param_track) by  */
    /* replacing it, syllable by syllable, with Catmull-Rom spline    */
    /* interpolation through start/mid/end control points, using the  */
    /* previous and next syllables' mid F0 as the outer control points. */
    float start_f0, mid_f0, end_f0;
    int start_index, end_index, mid_index;
    int nsi, nei, nmi; /* next syllable indices */
    float nmid_f0, pmid_f0;
    cst_item *syl;
    int i;
    float m;

    start_f0 = mid_f0 = end_f0 = -1.0;  /* -1.0 marks "not yet set" */
    for (syl=utt_rel_head(utt,"Syllable"); syl; syl=item_next(syl))
    {
        /* Frame indices of this syllable's first, last and middle frames */
        start_index = ffeature_int(syl,"R:SylStructure.daughter1.R:segstate.daughter1.R:mcep_link.daughter1.frame_number");
        end_index = ffeature_int(syl,"R:SylStructure.daughtern.R:segstate.daughtern.R:mcep_link.daughtern.frame_number");
        mid_index = (int)((start_index + end_index)/2.0);

        start_f0 = param_track->frames[start_index][0];
        if (end_f0 > 0.0)
            start_f0 = end_f0; /* not first time through: continue from previous end */
        if (mid_f0 < 0.0)
            pmid_f0 = start_f0; /* first time through */
        else
            pmid_f0 = mid_f0;   /* previous syllable's mid F0 */
        mid_f0 = param_track->frames[mid_index][0];
        if (item_next(syl)) /* not last syllable */
            end_f0 = (param_track->frames[end_index-1][0]+
                      param_track->frames[end_index][0])/2.0;
        else
            end_f0 = param_track->frames[end_index-1][0];
        nmid_f0=end_f0; /* in case there is no next syl */
        if (item_next(syl))
        {
            /* Mid F0 of the next syllable, used as the far control point */
            nsi = ffeature_int(syl,"n.R:SylStructure.daughter1.R:segstate.daughter1.R:mcep_link.daughter1.frame_number");
            nei = ffeature_int(syl,"n.R:SylStructure.daughtern.R:segstate.daughtern.R:mcep_link.daughtern.frame_number");
            nmi = (int)((nsi + nei)/2.0);
            nmid_f0 = param_track->frames[nmi][0];
        }
        /* start to mid syl: interpolate pmid -> start -> mid -> end */
        m = 1.0 / (mid_index - start_index);
        for (i=0; ((start_index+i)<mid_index); i++)
            param_track->frames[start_index+i][0] =
                catmull_rom_spline(i*m,pmid_f0,start_f0,mid_f0,end_f0);
        /* mid syl to end: interpolate start -> mid -> end -> next-mid */
        m = 1.0 / (end_index - mid_index);
        for (i=0; ((mid_index+i)<end_index); i++)
            param_track->frames[mid_index+i][0] =
                catmull_rom_spline(i*m,start_f0,mid_f0,end_f0,nmid_f0);
    }

    return;
}
#if 0
/* Disabled: simple three-point moving-average F0 smoother over the */
/* voiced (> 0.0) frames, superseded by the spline interpolation.   */
static void cg_smooth_F0_naive(cst_track *param_track)
{
    float l,s;   /* l: previous frame's F0, s: running sum */
    int i,c;     /* c: count of contributing neighbors */

    l = 0.0;
    for (i=0; i<param_track->num_frames-1; i++)
    {
        c = 0; s = 0;
        if (l > 0.0)
        {
            c++; s+=l;
        }
        if (param_track->frames[i+1][0] > 0.0)
        {
            c++; s+=param_track->frames[i+1][0];
        }
        l = param_track->frames[i][0];
        if (param_track->frames[i][0] > 0.0)
        {
            c++; s+=param_track->frames[i][0];
            param_track->frames[i][0] = s/c;
        }
    }

    return;
}
#endif
static void cg_smooth_F0(cst_utterance *utt,
                         cst_cg_db *cg_db,
                         cst_track *param_track)
{
    /* Smooth F0 and mark unvoiced frames as 0.0.  After spline       */
    /* interpolation the contour is re-normalized from the voice's    */
    /* F0 mean/stddev to the (possibly user-overridden) target mean   */
    /* and stddev, then clamped to a plausible range.                 */
    cst_item *frame;
    int idx;
    float target_mean, target_stddev;

    /* cg_smooth_F0_naive(param_track); */
    cg_F0_interpolate_spline(utt,param_track);

    target_mean = get_param_float(utt->features,"int_f0_target_mean",
                                  cg_db->f0_mean);
    target_mean *= get_param_float(utt->features,"f0_shift", 1.0);
    target_stddev =
        get_param_float(utt->features,"int_f0_target_stddev",
                        cg_db->f0_stddev);

    idx = 0;
    for (frame=utt_rel_head(utt,"mcep"); frame; frame=item_next(frame),idx++)
    {
        float *f0 = &param_track->frames[idx][0];

        if (!voiced_frame(frame))
        {
            *f0 = 0.0;   /* unvoiced frame */
            continue;
        }
        /* scale the F0 -- which normally wont change it at all */
        *f0 = (((*f0 - cg_db->f0_mean) / cg_db->f0_stddev)
               * target_stddev) + target_mean;
        /* Some safety checks: clamp to a plausible F0 range */
        if (*f0 < 50)
            *f0 = 50;
        else if (*f0 > 700)
            *f0 = 700;
    }

    return;
}
static cst_utterance *cg_predict_params(cst_utterance *utt)
{
    /* Predict F0 and the spectral parameter track (and, for mixed-  */
    /* excitation voices, the band-strength track) for every mcep    */
    /* frame by walking the voice's CART trees, averaging over all   */
    /* parameter models.  Results are stored on the utterance as     */
    /* "param_track" (and "str_track").                              */
    cst_cg_db *cg_db;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_item *mcep;
    const cst_cart *mcep_tree, *f0_tree;
    int i,j,f,p,o,pm;
    const char *mname;
    float f0_val;
    float local_gain, voicing;
    int fff;
    int extra_feats = 0;

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = new_track();
    /* NOTE(review): fff appears to be the index stride into the stored */
    /* model vectors (1 = entries include stddevs, 2 = skip over them)  */
    /* -- confirm against the voice data layout.                        */
    if (cg_db->do_mlpg) /* which should be the default */
        fff = 1; /* copy details with stddevs */
    else
        fff = 2; /* copy details without stddevs */

    extra_feats = 1; /* voicing */
    if (cg_db->mixed_excitation)
    {
        extra_feats += 5;  /* five band-strength channels */
        str_track = new_track();
        cst_track_resize(str_track,
                         utt_feat_int(utt,"param_track_num_frames"),
                         5);
    }
    cst_track_resize(param_track,
                     utt_feat_int(utt,"param_track_num_frames"),
                     (cg_db->num_channels[0]/fff)-
                     (2 * extra_feats));/* no voicing or str */
    f = 0;
    for (i=0,mcep=utt_rel_head(utt,"mcep"); mcep; i++,mcep=item_next(mcep))
    {
        mname = item_feat_string(mcep,"name");
        /* Per-token gain scaling; 0 (e.g. unset) means unity gain */
        local_gain = ffeature_float(mcep,"R:mcep_link.parent.R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_gain");
        if (local_gain == 0.0) local_gain = 1.0;

        /* Find the tree set matching this frame's state name */
        for (p=0; cg_db->types[p]; p++)
            if (cst_streq(mname,cg_db->types[p]))
                break;
        if (cg_db->types[p] == NULL)
            p=0; /* if there isn't a matching tree, use the first one */

        /* Predict F0 */
        f0_tree = cg_db->f0_trees[p];
        f0_val = val_float(cart_interpret(mcep,f0_tree));
        param_track->frames[i][0] = f0_val;
        /* what about stddev ? */

        /* We only have multiple models now, but the default is one model */
        /* Predict spectral coeffs */
        voicing = 0.0;
        for (pm=0; pm<cg_db->num_param_models; pm++)
        {
            mcep_tree = cg_db->param_trees[pm][p];
            f = val_int(cart_interpret(mcep,mcep_tree));
            /* If there is one model this will be fine, if there are */
            /* multiple models this will be the nth model */
            item_set_int(mcep,"clustergen_param_frame",f);

            /* Old code used to average in param[0] with F0 too (???) */
            /* Average each spectral channel over the parameter models */
            for (j=2; j<param_track->num_channels; j++)
            {
                if (pm == 0) param_track->frames[i][j] = 0.0;
                param_track->frames[i][j] +=
                    CG_MODEL_VECTOR(cg_db,model_vectors[pm],f,(j)*fff)/
                    (float)cg_db->num_param_models;
            }
            if (cg_db->mixed_excitation)
            {
                /* j has run up to num_channels above, so o is the     */
                /* offset of the first band-strength entry in the      */
                /* model vector                                        */
                o = j;
                for (j=0; j<5; j++)
                {
                    if (pm == 0) str_track->frames[i][j] = 0.0;
                    str_track->frames[i][j] +=
                        CG_MODEL_VECTOR(cg_db,model_vectors[pm],f,
                                        (o+(2*j))*fff) /
                        (float)cg_db->num_param_models;
                }
            }

            /* last coefficient is average voicing for cluster */
            /* NOTE(review): this running update equals the true mean  */
            /* for one or two models but only approximates it for more */
            /* -- confirm intended behavior when num_param_models > 2. */
            voicing /= (float)(pm+1);
            voicing +=
                CG_MODEL_VECTOR(cg_db,model_vectors[pm],f,
                                cg_db->num_channels[pm]-2) /
                (float)(pm+1);
        }
        item_set_float(mcep,"voicing",voicing);

        /* Apply local gain to c0 */
        param_track->frames[i][2] *= local_gain;

        param_track->times[i] = i * cg_db->frame_advance;
    }

    cg_smooth_F0(utt,cg_db,param_track);

    utt_set_feat(utt,"param_track",track_val(param_track));
    if (cg_db->mixed_excitation)
        utt_set_feat(utt,"str_track",track_val(str_track));

    return utt;
}
static cst_utterance *cg_resynth(cst_utterance *utt)
{
    /* Resynthesize a waveform from the predicted parameter tracks   */
    /* with the MLSA filter, optionally running MLPG smoothing first. */
    /* On failure (e.g. interruption) an empty wave is produced and   */
    /* the "Interrupted" feature is set on the utterance.             */
    cst_cg_db *cg_db;
    cst_wave *w;
    cst_track *param_track;
    cst_track *str_track = NULL;
    cst_track *smoothed_track;
    const cst_val *streaming_info_val;
    cst_audio_streaming_info *asi = NULL;

    /* If the caller requested streaming audio, attach this utterance */
    /* to the streaming callback state so the callback can see it.    */
    streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
    if (streaming_info_val)
    {
        asi = val_audio_streaming_info(streaming_info_val);
        asi->utt = utt;
    }

    cg_db = val_cg_db(utt_feat_val(utt,"cg_db"));
    param_track = val_track(utt_feat_val(utt,"param_track"));
    if (cg_db->mixed_excitation)
        str_track = val_track(utt_feat_val(utt,"str_track"));

    if (cg_db->do_mlpg)
    {
        smoothed_track = mlpg(param_track, cg_db);
        w = mlsa_resynthesis(smoothed_track,str_track,cg_db,asi);
        delete_track(smoothed_track);
    }
    else
        w=mlsa_resynthesis(param_track,str_track,cg_db,asi);

    if (w == NULL)
    {
        /* Synthesis Failed, probably because it was interrupted */
        utt_set_feat_int(utt,"Interrupted",1);
        w = new_wave();
    }

#if 0
    /* Dead code (never compiled); note the item_feat_float line below */
    /* is syntactically broken and is kept only as-is for history.     */
    /* Apply local gain */
    for (i=0,tok=utt_rel_head(utt,"Token"); tok; i++,tok=item_next(tok))
    {
        if (item_feat_present(tok,"local_gain"))
            local_gain = item_feat_float(tokget_param_fffeature_float(tok,"R:mcep_link.parent.R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.local_gain");
    }
#endif

    utt_set_wave(utt,w);

    return utt;
}