| /* --------------------------------------------------------------- */ |
| /* The HMM-Based Speech Synthesis System (HTS): version 1.1b */ |
| /* HTS Working Group */ |
| /* */ |
| /* Department of Computer Science */ |
| /* Nagoya Institute of Technology */ |
| /* and */ |
| /* Interdisciplinary Graduate School of Science and Engineering */ |
| /* Tokyo Institute of Technology */ |
| /* Copyright (c) 2001-2003 */ |
| /* All Rights Reserved. */ |
| /* */ |
| /* Permission is hereby granted, free of charge, to use and */ |
| /* distribute this software and its documentation without */ |
| /* restriction, including without limitation the rights to use, */ |
| /* copy, modify, merge, publish, distribute, sublicense, and/or */ |
| /* sell copies of this work, and to permit persons to whom this */ |
| /* work is furnished to do so, subject to the following conditions: */ |
| /* */ |
| /* 1. The code must retain the above copyright notice, this list */ |
| /* of conditions and the following disclaimer. */ |
| /* */ |
| /* 2. Any modifications must be clearly marked as such. */ |
| /* */ |
| /* NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSITITUTE OF TECHNOLOGY, */ |
| /* HTS WORKING GROUP, AND THE CONTRIBUTORS TO THIS WORK DISCLAIM */ |
| /* ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL */ |
| /* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
| /* SHALL NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSITITUTE OF */ |
| /* TECHNOLOGY, HTS WORKING GROUP, NOR THE CONTRIBUTORS BE LIABLE */ |
| /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY */ |
| /* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, */ |
| /* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTUOUS */ |
| /* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR */ |
| /* PERFORMANCE OF THIS SOFTWARE. */ |
| /* */ |
| /* --------------------------------------------------------------- */ |
| /* This is Zen's MLSA filter as ported by Toda to festvox vc */ |
| /* and back ported into hts/festival so we can do MLSA filtering */ |
| /* If I took more time I could probably make this use the same as */ |
| /* as the other code in this directory -- awb@cs.cmu.edu 03JAN06 */ |
| /* --------------------------------------------------------------- */ |
| /* and then ported into Flite (November 2007 awb@cs.cmu.edu) */ |
| /* with some speed uptimizations */ |
| |
| /*********************************************************************/ |
| /* */ |
| /* Mel-cepstral vocoder (pulse/noise excitation & MLSA filter) */ |
| /* 2003/12/26 by Heiga Zen */ |
| /* */ |
| /* Extracted from HTS and slightly modified */ |
| /* by Tomoki Toda (tomoki@ics.nitech.ac.jp) */ |
| /* June 2004 */ |
| /* Integrate as a Voice Conversion module */ |
| /* */ |
| /*-------------------------------------------------------------------*/ |
| |
| #include "cst_alloc.h" |
| #include "cst_string.h" |
| #include "cst_math.h" |
| #include "cst_track.h" |
| #include "cst_wave.h" |
| #include "cst_audio.h" |
| |
| #ifdef ANDROID |
| #define SPEED_HACK |
| #endif |
| |
| #ifdef UNDER_CE |
| #define SPEED_HACK |
| /* This is one of those other things you shouldn't do, but it makes |
| CG voices in flowm fast enough on my phone */ |
| #define double float |
| #endif |
| |
| #include "cst_vc.h" |
| #include "cst_cg.h" |
| #include "cst_mlsa.h" |
| |
| |
| static cst_wave *synthesis_body(const cst_track *params, |
| const cst_track *str, |
| double fs, double framem, |
| cst_cg_db *cg_db, |
| cst_audio_streaming_info *asi); |
| |
| cst_wave *mlsa_resynthesis(const cst_track *params, |
| const cst_track *str, |
| cst_cg_db *cg_db, |
| cst_audio_streaming_info *asi) |
| { |
| /* Resynthesizes a wave from given track */ |
| cst_wave *wave = 0; |
| int sr = cg_db->sample_rate; |
| double shift; |
| |
| if (params->num_frames > 1) |
| shift = 1000.0*(params->times[1]-params->times[0]); |
| else |
| shift = 5.0; |
| |
| wave = synthesis_body(params,str,sr,shift,cg_db,asi); |
| |
| return wave; |
| } |
| |
| static cst_wave *synthesis_body(const cst_track *params, /* f0 + mcep */ |
| const cst_track *str, |
| double fs, /* sampling frequency (Hz) */ |
| double framem, /* frame size */ |
| cst_cg_db *cg_db, |
| cst_audio_streaming_info *asi) |
| { |
| long t, pos; |
| int framel, i; |
| double f0; |
| VocoderSetup vs; |
| cst_wave *wave = 0; |
| double *mcep; |
| int stream_mark; |
| int rc = CST_AUDIO_STREAM_CONT; |
| int num_mcep; |
| double ffs = fs; |
| |
| num_mcep = params->num_channels-1; |
| /* For SPEED_HACK we could reduce num_mcep, and it will run faster */ |
| /* num_mcep -= 10; */ |
| framel = (int)(0.5 + (framem * ffs / 1000.0)); /* 80 for 16KHz */ |
| init_vocoder(ffs, framel, num_mcep, &vs, cg_db); |
| |
| if (str != NULL) |
| vs.gauss = MFALSE; |
| |
| /* synthesize waveforms by MLSA filter */ |
| wave = new_wave(); |
| cst_wave_resize(wave,params->num_frames * framel,1); |
| wave->sample_rate = fs; |
| |
| mcep = cst_alloc(double,num_mcep+1); |
| |
| for (t = 0, stream_mark = pos = 0; |
| (rc == CST_AUDIO_STREAM_CONT) && (t < params->num_frames); |
| t++) |
| { |
| f0 = (double)params->frames[t][0]; |
| for (i=1; i<num_mcep+1; i++) |
| mcep[i-1] = params->frames[t][i]; |
| mcep[i-1] = 0; |
| |
| if (str) |
| vocoder(f0, mcep, str->frames[t], num_mcep, cg_db, &vs, wave, &pos); |
| else |
| vocoder(f0, mcep, NULL, num_mcep, cg_db, &vs, wave, &pos); |
| |
| if (asi && (pos-stream_mark > asi->min_buffsize)) |
| { |
| rc=(*asi->asc)(wave,stream_mark,pos-stream_mark,0,asi); |
| stream_mark = pos; |
| } |
| } |
| wave->num_samples = pos; |
| |
| if (asi && (rc == CST_AUDIO_STREAM_CONT)) |
| { /* drain the last part of the waveform */ |
| (*asi->asc)(wave,stream_mark,pos-stream_mark,1,asi); |
| } |
| |
| /* memory free */ |
| cst_free(mcep); |
| free_vocoder(&vs); |
| |
| if (rc == CST_AUDIO_STREAM_STOP) |
| { |
| delete_wave(wave); |
| return NULL; |
| } |
| else |
| return wave; |
| } |
| |
| static void init_vocoder(double fs, int framel, int m, |
| VocoderSetup *vs, cst_cg_db *cg_db) |
| { |
| /* initialize global parameter */ |
| vs->fprd = framel; |
| vs->iprd = 1; |
| vs->seed = 1; |
| #ifdef SPEED_HACK |
| /* This makes it about 25% faster and sounds basically the same */ |
| vs->pd = 4; |
| #else |
| vs->pd = 5; |
| #endif |
| |
| vs->next =1; |
| vs->gauss = MTRUE; |
| |
| /* Pade' approximants */ |
| vs->pade[ 0]=1.0; |
| vs->pade[ 1]=1.0; vs->pade[ 2]=0.0; |
| vs->pade[ 3]=1.0; vs->pade[ 4]=0.0; vs->pade[ 5]=0.0; |
| vs->pade[ 6]=1.0; vs->pade[ 7]=0.0; vs->pade[ 8]=0.0; vs->pade[ 9]=0.0; |
| vs->pade[10]=1.0; vs->pade[11]=0.4999273; vs->pade[12]=0.1067005; vs->pade[13]=0.01170221; vs->pade[14]=0.0005656279; |
| vs->pade[15]=1.0; vs->pade[16]=0.4999391; vs->pade[17]=0.1107098; vs->pade[18]=0.01369984; vs->pade[19]=0.0009564853; |
| vs->pade[20]=0.00003041721; |
| |
| vs->rate = fs; |
| vs->c = cst_alloc(double,3 * (m + 1) + 3 * (vs->pd + 1) + vs->pd * (m + 2)); |
| |
| vs->p1 = -1; |
| vs->sw = 0; |
| vs->x = 0x55555555; |
| |
| /* for postfiltering */ |
| vs->mc = NULL; |
| vs->o = 0; |
| vs->d = NULL; |
| vs->irleng= 64; |
| |
| // for MIXED EXCITATION |
| vs->ME_order = cg_db->ME_order; |
| vs->ME_num = cg_db->ME_num; |
| vs->hpulse = cst_alloc(double,vs->ME_order); |
| vs->hnoise = cst_alloc(double,vs->ME_order); |
| vs->xpulsesig = cst_alloc(double,vs->ME_order); |
| vs->xnoisesig = cst_alloc(double,vs->ME_order); |
| vs->h = cg_db->me_h; |
| |
| return; |
| } |
| |
| static double plus_or_minus_one() |
| { |
| /* Randomly return 1 or -1 */ |
| /* not sure rand() is portable */ |
| if (rand() > RAND_MAX/2.0) |
| return 1.0; |
| else |
| return -1.0; |
| } |
| |
| static void vocoder(double p, double *mc, |
| const float *str, |
| int m, cst_cg_db *cg_db, |
| VocoderSetup *vs, cst_wave *wav, long *pos) |
| { |
| double inc, x, e1, e2; |
| int i, j, k; |
| double xpulse, xnoise; |
| double fxpulse, fxnoise; |
| float gain=1.0; |
| |
| if (cg_db->gain != 0.0) |
| gain = cg_db->gain; |
| |
| if (str != NULL) /* MIXED-EXCITATION */ |
| { |
| /* Copy in str's and build hpulse and hnoise for this frame */ |
| for (i=0; i<vs->ME_order; i++) |
| { |
| vs->hpulse[i] = vs->hnoise[i] = 0.0; |
| for (j=0; j<vs->ME_num; j++) |
| { |
| vs->hpulse[i] += str[j] * vs->h[j][i]; |
| vs->hnoise[i] += (1 - str[j]) * vs->h[j][i]; |
| } |
| } |
| } |
| |
| if (p != 0.0) |
| p = vs->rate / p; /* f0 -> pitch */ |
| |
| if (vs->p1 < 0) { |
| if (vs->gauss & (vs->seed != 1)) |
| vs->next = srnd((unsigned)vs->seed); |
| |
| vs->p1 = p; |
| vs->pc = vs->p1; |
| vs->cc = vs->c + m + 1; |
| vs->cinc = vs->cc + m + 1; |
| vs->d1 = vs->cinc + m + 1; |
| |
| mc2b(mc, vs->c, m, cg_db->mlsa_alpha); |
| |
| if (cg_db->mlsa_beta > 0.0 && m > 1) { |
| e1 = b2en(vs->c, m, cg_db->mlsa_alpha, vs); |
| vs->c[1] -= cg_db->mlsa_beta * cg_db->mlsa_alpha * mc[2]; |
| for (k=2;k<=m;k++) |
| vs->c[k] *= (1.0 + cg_db->mlsa_beta); |
| e2 = b2en(vs->c, m, cg_db->mlsa_alpha, vs); |
| vs->c[0] += log(e1/e2)/2; |
| } |
| |
| return; |
| } |
| |
| mc2b(mc, vs->cc, m, cg_db->mlsa_alpha); |
| if (cg_db->mlsa_beta>0.0 && m > 1) { |
| e1 = b2en(vs->cc, m, cg_db->mlsa_alpha, vs); |
| vs->cc[1] -= cg_db->mlsa_beta * cg_db->mlsa_alpha * mc[2]; |
| for (k = 2; k <= m; k++) |
| vs->cc[k] *= (1.0 + cg_db->mlsa_beta); |
| e2 = b2en(vs->cc, m, cg_db->mlsa_alpha, vs); |
| vs->cc[0] += log(e1 / e2) / 2.0; |
| } |
| |
| for (k=0; k<=m; k++) |
| vs->cinc[k] = (vs->cc[k] - vs->c[k]) * |
| (double)vs->iprd / (double)vs->fprd; |
| |
| if (vs->p1!=0.0 && p!=0.0) { |
| inc = (p - vs->p1) * (double)vs->iprd / (double)vs->fprd; |
| } else { |
| inc = 0.0; |
| vs->pc = p; |
| vs->p1 = 0.0; |
| } |
| |
| for (j = vs->fprd, i = (vs->iprd + 1) / 2; j--;) { |
| if (vs->p1 == 0.0) { |
| if (vs->gauss) |
| x = (double) nrandom(vs); |
| else |
| x = plus_or_minus_one(); |
| if (str != NULL) /* MIXED EXCITATION */ |
| { |
| xnoise = x; |
| xpulse = 0.0; |
| } |
| } else { |
| if ((vs->pc += 1.0) >= vs->p1) { |
| x = sqrt (vs->p1); |
| vs->pc = vs->pc - vs->p1; |
| } else |
| x = 0.0; |
| |
| if (str != NULL) /* MIXED EXCITATION */ |
| { |
| xpulse = x; |
| xnoise = plus_or_minus_one(); |
| } |
| } |
| |
| /* MIXED EXCITATION */ |
| /* The real work -- apply shaping filters to pulse and noise */ |
| if (str != NULL) |
| { |
| fxpulse = fxnoise = 0.0; |
| for (k=vs->ME_order-1; k>0; k--) |
| { |
| fxpulse += vs->hpulse[k] * vs->xpulsesig[k]; |
| fxnoise += vs->hnoise[k] * vs->xnoisesig[k]; |
| |
| vs->xpulsesig[k] = vs->xpulsesig[k-1]; |
| vs->xnoisesig[k] = vs->xnoisesig[k-1]; |
| } |
| fxpulse += vs->hpulse[0] * xpulse; |
| fxnoise += vs->hnoise[0] * xnoise; |
| vs->xpulsesig[0] = xpulse; |
| vs->xnoisesig[0] = xnoise; |
| |
| x = fxpulse + fxnoise; /* excitation is pulse plus noise */ |
| } |
| |
| if (cg_db->sample_rate == 8000) |
| /* 8KHz voices are too quiet: this is probably not general */ |
| x *= exp(vs->c[0])*2.0; |
| else |
| x *= exp(vs->c[0])*gain; |
| |
| x = mlsadf(x, vs->c, m, cg_db->mlsa_alpha, vs->pd, vs->d1, vs); |
| |
| wav->samples[*pos] = (short)x; |
| *pos += 1; |
| |
| if (!--i) { |
| vs->p1 += inc; |
| for (k = 0; k <= m; k++) vs->c[k] += vs->cinc[k]; |
| i = vs->iprd; |
| } |
| } |
| |
| vs->p1 = p; |
| memmove(vs->c,vs->cc,sizeof(double)*(m+1)); |
| |
| return; |
| } |
| |
| static double mlsadf(double x, double *b, int m, double a, int pd, double *d, VocoderSetup *vs) |
| { |
| |
| vs->ppade = &(vs->pade[pd*(pd+1)/2]); |
| |
| x = mlsadf1 (x, b, m, a, pd, d, vs); |
| x = mlsadf2 (x, b, m, a, pd, &d[2*(pd+1)], vs); |
| |
| return(x); |
| } |
| |
| static double mlsadf1(double x, double *b, int m, double a, int pd, double *d, VocoderSetup *vs) |
| { |
| double v, out = 0.0, *pt, aa; |
| register int i; |
| |
| aa = 1 - a*a; |
| pt = &d[pd+1]; |
| |
| for (i=pd; i>=1; i--) { |
| d[i] = aa*pt[i-1] + a*d[i]; |
| pt[i] = d[i] * b[1]; |
| v = pt[i] * vs->ppade[i]; |
| x += (1 & i) ? v : -v; |
| out += v; |
| } |
| |
| pt[0] = x; |
| out += x; |
| |
| return(out); |
| } |
| |
| static double mlsadf2 (double x, double *b, int m, double a, int pd, double *d, VocoderSetup *vs) |
| { |
| double v, out = 0.0, *pt; |
| // double aa; |
| register int i; |
| |
| //aa = 1 - a*a; |
| pt = &d[pd * (m+2)]; |
| |
| for (i=pd; i>=1; i--) { |
| pt[i] = mlsafir (pt[i-1], b, m, a, &d[(i-1)*(m+2)]); |
| |
| v = pt[i] * vs->ppade[i]; |
| |
| x += (1&i) ? v : -v; |
| out += v; |
| } |
| |
| pt[0] = x; |
| out += x; |
| |
| return(out); |
| } |
| |
| static double mlsafir (double x, double *b, int m, double a, double *d) |
| { |
| double y = 0.0; |
| double aa; |
| register int i; |
| |
| aa = 1 - a*a; |
| |
| d[0] = x; |
| d[1] = aa*d[0] + a*d[1]; |
| for (i=2; i<= m; i++) { |
| d[i] = d[i] + a*(d[i+1]-d[i-1]); |
| y += d[i]*b[i]; |
| } |
| |
| for (i=m+1; i>1; i--) |
| d[i] = d[i-1]; |
| |
| return(y); |
| } |
| |
| static double nrandom (VocoderSetup *vs) |
| { |
| if (vs->sw == 0) { |
| vs->sw = 1; |
| do { |
| vs->r1 = 2.0 * rnd(&vs->next) - 1.0; |
| vs->r2 = 2.0 * rnd(&vs->next) - 1.0; |
| vs->s = vs->r1 * vs->r1 + vs->r2 * vs->r2; |
| } while (vs->s > 1 || vs->s == 0); |
| |
| vs->s = sqrt (-2 * log(vs->s) / vs->s); |
| |
| return(vs->r1*vs->s); |
| } |
| else { |
| vs->sw = 0; |
| |
| return (vs->r2*vs->s); |
| } |
| } |
| |
| static double rnd (unsigned long *next) |
| { |
| double r; |
| |
| *next = *next * 1103515245L + 12345; |
| r = (*next / 65536L) % 32768L; |
| |
| return(r/RANDMAX); |
| } |
| |
| static unsigned long srnd ( unsigned long seed ) |
| { |
| return(seed); |
| } |
| |
| /* mc2b : transform mel-cepstrum to MLSA digital fillter coefficients */ |
| static void mc2b (double *mc, double *b, int m, double a) |
| { |
| b[m] = mc[m]; |
| |
| for (m--; m>=0; m--) |
| b[m] = mc[m] - a * b[m+1]; |
| |
| return; |
| } |
| |
| |
| static double b2en (double *b, int m, double a, VocoderSetup *vs) |
| { |
| double en; |
| int k; |
| |
| if (vs->o<m) { |
| if (vs->mc != NULL) |
| cst_free(vs->mc); |
| |
| vs->mc = cst_alloc(double,(m + 1) + 2 * vs->irleng); |
| vs->cep = vs->mc + m+1; |
| vs->ir = vs->cep + vs->irleng; |
| } |
| |
| b2mc(b, vs->mc, m, a); |
| freqt(vs->mc, m, vs->cep, vs->irleng-1, -a, vs); |
| c2ir(vs->cep, vs->irleng, vs->ir, vs->irleng); |
| en = 0.0; |
| |
| for (k=0;k<vs->irleng;k++) |
| en += vs->ir[k] * vs->ir[k]; |
| |
| return(en); |
| } |
| |
| |
| /* b2bc : transform MLSA digital filter coefficients to mel-cepstrum */ |
| static void b2mc (double *b, double *mc, int m, double a) |
| { |
| double d, o; |
| |
| d = mc[m] = b[m]; |
| for (m--; m>=0; m--) { |
| o = b[m] + a * d; |
| d = b[m]; |
| mc[m] = o; |
| } |
| |
| return; |
| } |
| |
| /* freqt : frequency transformation */ |
| static void freqt (double *c1, int m1, double *c2, int m2, double a, VocoderSetup *vs) |
| { |
| register int i, j; |
| double b; |
| |
| if (vs->d==NULL) { |
| vs->size = m2; |
| vs->d = cst_alloc(double,vs->size + vs->size + 2); |
| vs->g = vs->d+vs->size+1; |
| } |
| |
| if (m2>vs->size) { |
| cst_free(vs->d); |
| vs->size = m2; |
| vs->d = cst_alloc(double,vs->size + vs->size + 2); |
| vs->g = vs->d+vs->size+1; |
| } |
| |
| b = 1-a*a; |
| for (i=0; i<m2+1; i++) |
| vs->g[i] = 0.0; |
| |
| for (i=-m1; i<=0; i++) { |
| if (0 <= m2) |
| vs->g[0] = c1[-i]+a*(vs->d[0]=vs->g[0]); |
| if (1 <= m2) |
| vs->g[1] = b*vs->d[0]+a*(vs->d[1]=vs->g[1]); |
| for (j=2; j<=m2; j++) |
| vs->g[j] = vs->d[j-1]+a*((vs->d[j]=vs->g[j])-vs->g[j-1]); |
| } |
| |
| memmove(c2,vs->g,sizeof(double)*(m2+1)); |
| |
| return; |
| } |
| |
| /* c2ir : The minimum phase impulse response is evaluated from the minimum phase cepstrum */ |
| static void c2ir (double *c, int nc, double *h, int leng) |
| { |
| register int n, k, upl; |
| double d; |
| |
| h[0] = exp(c[0]); |
| for (n=1; n<leng; n++) { |
| d = 0; |
| upl = (n>=nc) ? nc-1 : n; |
| for (k=1; k<=upl; k++) |
| d += k*c[k]*h[n-k]; |
| h[n] = d/n; |
| } |
| |
| return; |
| } |
| |
| static void free_vocoder(VocoderSetup *vs) |
| { |
| |
| cst_free(vs->c); |
| cst_free(vs->mc); |
| cst_free(vs->d); |
| |
| vs->c = NULL; |
| vs->mc = NULL; |
| vs->d = NULL; |
| vs->ppade = NULL; |
| vs->cc = NULL; |
| vs->cinc = NULL; |
| vs->d1 = NULL; |
| vs->g = NULL; |
| vs->cep = NULL; |
| vs->ir = NULL; |
| |
| cst_free(vs->hpulse); |
| cst_free(vs->hnoise); |
| cst_free(vs->xpulsesig); |
| cst_free(vs->xnoisesig); |
| |
| |
| return; |
| } |
| |