blob: 0619a4b6536cb87296b9846b0dcbd1a9179d53fa [file] [log] [blame]
/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 2000 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author: Alan W Black (awb@cs.cmu.edu) */
/* Date: September 2000 */
/*************************************************************************/
/* */
/* General synthesis control */
/* */
/*************************************************************************/
#include "cst_hrg.h"
#include "cst_cart.h"
#include "cst_tokenstream.h"
#include "cst_utt_utils.h"
#include "cst_lexicon.h"
#include "cst_units.h"
#include "cst_synth.h"
#include "cst_phoneset.h"
/* Register the cst_breakfunc function-pointer type with the cst_val    */
/* system under the name "breakfunc".                                   */
/* NOTE(review): presumably this expands to the val wrapper/accessor    */
/* definitions for that type -- confirm against the macro definition.   */
CST_VAL_REGISTER_FUNCPTR(breakfunc,cst_breakfunc)
/* Debug tracing for the synthesis modules.                             */
/* SYNTH_MODULES_DEBUG defaults to 0 (tracing compiled out).  When it   */
/* is greater than 0, DPRINTF(l,x) prints through cst_dbgmsg if the     */
/* debug level exceeds l; x is a parenthesised printf-style argument    */
/* list, e.g. DPRINTF(0,("value %d\n",v));                              */
/* Both forms expand to a single do { } while (0) statement so that the */
/* macro is safe inside unbraced if/else bodies (no dangling else).     */
#ifndef SYNTH_MODULES_DEBUG
#define SYNTH_MODULES_DEBUG 0
#endif
#if SYNTH_MODULES_DEBUG > 0
#define DPRINTF(l,x) do { if (SYNTH_MODULES_DEBUG > (l)) cst_dbgmsg x; } while (0)
#else
#define DPRINTF(l,x) do { } while (0)
#endif
/* Forward declaration: tokentosegs is referenced by the             */
/* synth_method_phones table before its definition later in the file. */
static cst_utterance *tokentosegs(cst_utterance *u);
/* Module pipeline run by utt_synth().                                  */
/* Each entry pairs a feature name with a default function.  When the   */
/* utterance features contain a value under that name it is called      */
/* instead of the default; a NULL default means the step does nothing   */
/* unless such a feature is present (see apply_synth_module).           */
/* The list is terminated by a { NULL, NULL } entry.                    */
static const cst_synth_module synth_method_text[] = {
{ "tokenizer_func", default_tokenization },
{ "textanalysis_func", default_textanalysis },
{ "pos_tagger_func", default_pos_tagger },
{ "phrasing_func", default_phrasing },
{ "lexical_insertion_func", default_lexical_insertion },
{ "pause_insertion_func", default_pause_insertion },
{ "intonation_func", cart_intonation },
{ "postlex_func", NULL },
{ "duration_model_func", cart_duration },
{ "f0_model_func", NULL },
{ "wave_synth_func", NULL },
{ "post_synth_hook_func", NULL },
{ NULL, NULL }
};
/* Module pipeline run by utt_synth_text2segs(): the same leading steps */
/* as synth_method_text, stopping after pause insertion (no intonation, */
/* duration, F0 or waveform steps).  Terminated by { NULL, NULL }.      */
static const cst_synth_module synth_method_text2segs[] = {
{ "tokenizer_func", default_tokenization },
{ "textanalysis_func", default_textanalysis },
{ "pos_tagger_func", default_pos_tagger },
{ "phrasing_func", default_phrasing },
{ "lexical_insertion_func", default_lexical_insertion },
{ "pause_insertion_func", default_pause_insertion },
{ NULL, NULL }
};
/* Module pipeline run by utt_synth_tokens(): identical to              */
/* synth_method_text except that it has no tokenizer step, so it starts */
/* at text analysis.  Terminated by { NULL, NULL }.                     */
static const cst_synth_module synth_method_tokens[] = {
{ "textanalysis_func", default_textanalysis },
{ "pos_tagger_func", default_pos_tagger },
{ "phrasing_func", default_phrasing },
{ "lexical_insertion_func", default_lexical_insertion },
{ "pause_insertion_func", default_pause_insertion },
{ "intonation_func", cart_intonation },
{ "postlex_func", NULL },
{ "duration_model_func", cart_duration },
{ "f0_model_func", NULL },
{ "wave_synth_func", NULL },
{ "post_synth_hook_func", NULL },
{ NULL, NULL }
};
/* Module pipeline run by utt_synth_phones().                           */
/* The default text analysis step is tokentosegs, which treats each     */
/* token as a phone name, and the default F0 model is flat_prosody.     */
/* There are no phrasing, lexical insertion or pause insertion steps,   */
/* and intonation has no default.  Terminated by { NULL, NULL }.        */
static const cst_synth_module synth_method_phones[] = {
{ "tokenizer_func", default_tokenization },
{ "textanalysis_func", tokentosegs },
{ "pos_tagger_func", default_pos_tagger },
{ "intonation_func", NULL },
{ "duration_model_func", cart_duration },
{ "f0_model_func", flat_prosody },
{ "wave_synth_func", NULL },
{ "post_synth_hook_func", NULL },
{ NULL, NULL }
};
/* Build a new utterance for voice v that holds wave w, as if w had     */
/* just been synthesized.  If the utterance features provide            */
/* "streaming_info", the whole wave is passed once to its streaming     */
/* callback.  Returns the new utterance.                                */
cst_utterance *utt_synth_wave(cst_wave *w,cst_voice *v)
{
    cst_utterance *utt = new_utterance();
    const cst_val *si_val;
    cst_audio_streaming_info *stream_info;

    utt_init(utt,v);
    utt_set_wave(utt,w);

    si_val = get_param_val(utt->features,"streaming_info",NULL);
    if (si_val == NULL)
        return utt;               /* no stream */

    stream_info = val_audio_streaming_info(si_val);
    stream_info->utt = utt;

    /* Stream all samples in a single call */
    (*stream_info->asc)(w,0,w->num_samples,1,stream_info);

    return utt;
}
/* Run one synthesis step on u.                                         */
/* A value stored in the utterance features under mod->hookname takes   */
/* priority and is called as an utterance function; otherwise the       */
/* module's default hook is called when it is non-NULL.  With neither,  */
/* u is returned untouched.  Returns whatever the chosen function       */
/* returns.                                                             */
cst_utterance *apply_synth_module(cst_utterance *u,
                                  const cst_synth_module *mod)
{
    const cst_val *override = feat_val(u->features, mod->hookname);

    if (override != NULL)
        return (*val_uttfunc(override))(u);

    if (mod->defhook == NULL)
        return u;

    return (*mod->defhook)(u);
}
/* Run every module of meth, in order, on u.  meth must end with an     */
/* entry whose hookname is NULL.  Each step receives the utterance      */
/* returned by the previous one; processing stops and NULL is returned  */
/* as soon as a step returns NULL.                                      */
cst_utterance *apply_synth_method(cst_utterance *u,
                                  const cst_synth_module meth[])
{
    const cst_synth_module *step;

    for (step = meth; step->hookname != NULL; step++)
    {
        u = apply_synth_module(u, step);
        if (u == NULL)
            return NULL;
    }

    return u;
}
/* Attach voice vox to utterance u.                                     */
/* The voice's features and feature functions are linked into the       */
/* utterance's, so that they are searched too (after the utterance's    */
/* own); then the voice's utt_init function is run if it has one.       */
/* Returns u.                                                           */
cst_utterance *utt_init(cst_utterance *u, cst_voice *vox)
{
    feat_link_into(vox->features,u->features);
    feat_link_into(vox->ffunctions,u->ffunctions);

    if (vox->utt_init != NULL)
        (*vox->utt_init)(u, vox);

    return u;
}
/* Run the synth_method_text pipeline (tokenizer through post-synth     */
/* hook) on u.  Returns the resulting utterance, or NULL if a step      */
/* returned NULL.                                                       */
cst_utterance *utt_synth(cst_utterance *u)
{
    const cst_synth_module *pipeline = synth_method_text;

    return apply_synth_method(u, pipeline);
}
/* Run the synth_method_tokens pipeline on u, i.e. synthesis starting   */
/* at text analysis with no tokenizer step.  Returns the resulting      */
/* utterance, or NULL if a step returned NULL.                          */
cst_utterance *utt_synth_tokens(cst_utterance *u)
{
    const cst_synth_module *pipeline = synth_method_tokens;

    return apply_synth_method(u, pipeline);
}
/* Run the synth_method_text2segs pipeline on u: tokenizer through      */
/* pause insertion only.  Returns the resulting utterance, or NULL if   */
/* a step returned NULL.                                                */
cst_utterance *utt_synth_text2segs(cst_utterance *u)
{
    const cst_synth_module *pipeline = synth_method_text2segs;

    return apply_synth_method(u, pipeline);
}
/* Run the synth_method_phones pipeline on u, in which tokens are       */
/* converted to segments by tokentosegs.  Returns the resulting         */
/* utterance, or NULL if a step returned NULL.                          */
cst_utterance *utt_synth_phones(cst_utterance *u)
{
    const cst_synth_module *pipeline = synth_method_phones;

    return apply_synth_method(u, pipeline);
}
/* Split the utterance's input text into tokens.                        */
/* Creates the "Token" relation and appends one item per non-empty      */
/* token read from a string token stream.  The stream's whitespace and  */
/* punctuation classes come from the "text_*" utterance parameters.     */
/* Each item records the token name, its preceding whitespace, its      */
/* pre- and post-punctuation, and the stream's file position and line   */
/* number.  Returns u.                                                  */
cst_utterance *default_tokenization(cst_utterance *u)
{
    const char *input, *tok;
    const char *ws, *single, *prepunc, *postpunc;
    cst_tokenstream *ts;
    cst_relation *token_rel;
    cst_item *item;

    input = utt_input_text(u);
    token_rel = utt_relation_create(u,"Token");

    ws       = get_param_string(u->features,"text_whitespace",NULL);
    single   = get_param_string(u->features,"text_singlecharsymbols",NULL);
    prepunc  = get_param_string(u->features,"text_prepunctuation",NULL);
    postpunc = get_param_string(u->features,"text_postpunctuation",NULL);
    ts = ts_open_string(input, ws, single, prepunc, postpunc);

    while (!ts_eof(ts))
    {
        tok = ts_get(ts);
        if (cst_strlen(tok) == 0)
            continue;             /* skip empty tokens */

        item = relation_append(token_rel,NULL);
        item_set_string(item,"name",tok);
        item_set_string(item,"whitespace",ts->whitespace);
        item_set_string(item,"prepunctuation",ts->prepunctuation);
        item_set_string(item,"punc",ts->postpunctuation);
        item_set_int(item,"file_pos",ts->file_pos);
        item_set_int(item,"line_number",ts->line_number);
    }

    ts_close(ts);
    return u;
}
/* Default token-to-words mapping: returns a newly built one-element    */
/* list holding the token's "name" as a string value.                   */
cst_val *default_tokentowords(cst_item *i)
{
    const char *token_name = item_feat_string(i,"name");
    cst_val *word = string_val(token_name);

    return cons_val(word, NULL);
}
/* Expand every token into words.                                       */
/* Creates the "Word" relation.  For each item in "Token" a word list   */
/* is obtained from the "tokentowords_func" utterance feature when it   */
/* is set, otherwise from default_tokentowords.  Each list entry        */
/* becomes a daughter of its token and is appended to "Word".  An entry */
/* that is itself a cons carries the word name in its car and extra     */
/* features in its cdr, which are copied onto the word.  The word list  */
/* is deleted once processed.  Returns u.                               */
cst_utterance *default_textanalysis(cst_utterance *u)
{
    cst_relation *word_rel;
    const cst_val *ttw_func;
    cst_item *tok, *new_word;
    cst_val *word_list;
    const cst_val *cell, *entry;

    word_rel = utt_relation_create(u,"Word");
    ttw_func = feat_val(u->features, "tokentowords_func");

    tok = relation_head(utt_relation(u,"Token"));
    while (tok != NULL)
    {
        if (ttw_func != NULL)
            word_list = (cst_val *)(*val_itemfunc(ttw_func))(tok);
        else
            word_list = default_tokentowords(tok);

        for (cell = word_list; cell != NULL; cell = val_cdr(cell))
        {
            new_word = item_add_daughter(tok,NULL);
            entry = val_car(cell);
            if (!cst_val_consp(entry))
            {
                item_set_string(new_word,"name",val_string(entry));
            }
            else
            {   /* Has extra features */
                item_set_string(new_word,"name",val_string(val_car(entry)));
                feat_copy_into(val_features(val_cdr(entry)),
                               item_feats(new_word));
            }
            relation_append(word_rel,new_word);
        }
        delete_val(word_list);

        tok = item_next(tok);
    }

    return u;
}
/* Group words into phrases.                                            */
/* Creates the "Phrase" relation.  Words are added as daughters of the  */
/* current phrase (named "B"); when the "phrasing_cart" feature is      */
/* present and the CART predicts "BB" for a word, the next word starts  */
/* a new phrase.  Without a phrasing CART all words go into a single    */
/* phrase.  Finally the last phrase is renamed "BB" when it is not the  */
/* first phrase.  Returns u.                                            */
cst_utterance *default_phrasing(cst_utterance *u)
{
    cst_relation *phrase_rel;
    cst_cart *cart = NULL;
    cst_item *word;
    cst_item *cur_phrase = NULL;    /* phrase receiving words now */
    cst_item *last_phrase = NULL;   /* most recently created phrase */
    const cst_val *prediction;

    phrase_rel = utt_relation_create(u,"Phrase");

    if (feat_present(u->features,"phrasing_cart"))
        cart = val_cart(feat_val(u->features,"phrasing_cart"));

    word = relation_head(utt_relation(u,"Word"));
    while (word != NULL)
    {
        if (cur_phrase == NULL)
        {
            cur_phrase = relation_append(phrase_rel,NULL);
            last_phrase = cur_phrase;
            item_set_string(cur_phrase,"name","B");
        }
        item_add_daughter(cur_phrase,word);

        if (cart != NULL)
        {
            prediction = cart_interpret(word,cart);
            if (cst_streq(val_string(prediction),"BB"))
                cur_phrase = NULL;      /* break after this word */
        }

        word = item_next(word);
    }

    /* follow festival */
    if (last_phrase != NULL && item_prev(last_phrase) != NULL)
        item_set_string(last_phrase,"name","BB");

    return u;
}
/* Insert silence segments: one at the very start of the "Segment"      */
/* relation and one after the final segment of each phrase.  The        */
/* silence phone name is read from the "silence" feature.  For each     */
/* phrase the words are scanned from last to first and the pause goes   */
/* after the last segment of the first word found that has one.         */
/* Returns u.                                                           */
cst_utterance *default_pause_insertion(cst_utterance *u)
{
    const char *sil_name;
    cst_relation *seg_rel;
    cst_item *first_seg, *phrase, *last_seg, *pause;
    const cst_item *w;

    sil_name = val_string(feat_val(u->features,"silence"));

    /* Initial silence: prepend to the first segment, or start the */
    /* relation when it is still empty */
    seg_rel = utt_relation(u,"Segment");
    first_seg = relation_head(seg_rel);
    if (first_seg != NULL)
        pause = item_prepend(first_seg,NULL);
    else
        pause = relation_append(seg_rel,NULL);
    item_set_string(pause,"name",sil_name);

    /* Phrase-final silences */
    for (phrase = relation_head(utt_relation(u,"Phrase"));
         phrase != NULL;
         phrase = item_next(phrase))
    {
        w = item_last_daughter(phrase);
        while (w != NULL)
        {
            last_seg =
                path_to_item(w,"R:SylStructure.daughtern.daughtern.R:Segment");
            if (last_seg != NULL)
            {
                pause = item_append(last_seg,NULL);
                item_set_string(pause,"name",sil_name);
                break;
            }
            w = item_prev(w);
        }
    }

    return u;
}
/* Predict accents and boundary tones for every syllable.               */
/* Does nothing when the "no_intonation_accent_model" feature is        */
/* present.  Otherwise the CARTs stored under "int_cart_accents" and    */
/* "int_cart_tones" are interpreted for each item of the "Syllable"     */
/* relation; any prediction other than "NONE" is stored as the          */
/* syllable's "accent" or "endtone" feature respectively.  Returns u.   */
cst_utterance *cart_intonation(cst_utterance *u)
{
    cst_cart *accent_cart, *tone_cart;
    cst_item *syl;
    const char *label;

    /* not all languages have intonation models */
    if (feat_present(u->features,"no_intonation_accent_model"))
        return u;

    accent_cart = val_cart(feat_val(u->features,"int_cart_accents"));
    tone_cart = val_cart(feat_val(u->features,"int_cart_tones"));

    syl = relation_head(utt_relation(u,"Syllable"));
    while (syl != NULL)
    {
        label = val_string(cart_interpret(syl,accent_cart));
        if (!cst_streq("NONE",label))
            item_set_string(syl,"accent",label);

        label = val_string(cart_interpret(syl,tone_cart));
        if (!cst_streq("NONE",label))
            item_set_string(syl,"endtone",label);

        DPRINTF(0,("word %s gpos %s stress %s ssyl_in %s ssyl_out %s accent %s endtone %s\n",
                   ffeature_string(syl,"R:SylStructure.parent.name"),
                   ffeature_string(syl,"R:SylStructure.parent.gpos"),
                   ffeature_string(syl,"stress"),
                   ffeature_string(syl,"ssyl_in"),
                   ffeature_string(syl,"ssyl_out"),
                   ffeature_string(syl,"accent"),
                   ffeature_string(syl,"endtone")));

        syl = item_next(syl);
    }

    return u;
}
/* Register dur_stats as a cst_val type.                                */
/* NOTE(review): presumably this is what provides val_dur_stats(), used */
/* by cart_duration, and the NODEL variant means the val does not free  */
/* the wrapped data -- confirm against the macro definition.            */
CST_VAL_REGISTER_TYPE_NODEL(dur_stats,dur_stats)
/* Look up the duration statistics for phone ph in ds, an array of      */
/* entries terminated by a null entry.  Returns the entry whose phone   */
/* name equals ph, or the first entry of ds when there is no match.     */
const dur_stat *phone_dur_stat(const dur_stats *ds,const char *ph)
{
    const dur_stats *entry;

    for (entry = ds; *entry; entry++)
    {
        if (cst_streq(ph,(*entry)->phone))
            return *entry;
    }

    return ds[0];     /* unknown phone: fall back to the first entry */
}
/* Assign an "end" time to every segment.                               */
/* Does nothing when the "no_segment_duration_model" feature is         */
/* present.  Otherwise, for each item of the "Segment" relation the     */
/* "dur_cart" tree predicts a z-score, which is converted to a duration */
/* with the phone's mean and standard deviation from "dur_stats" and    */
/* scaled by the "duration_stretch" parameter (default 1.0).  A         */
/* non-zero local_duration_stretch on the segment's token multiplies    */
/* the global stretch.  Durations are accumulated into the running end  */
/* time stored on each segment.  Returns u.                             */
cst_utterance *cart_duration(cst_utterance *u)
{
    cst_cart *dur_tree;
    dur_stats *ds;
    const dur_stat *stat;
    cst_item *seg;
    float global_stretch, stretch;
    float zdur, dur;
    float end = 0;

    /* not all methods need segment durations */
    if (feat_present(u->features,"no_segment_duration_model"))
        return u;

    dur_tree = val_cart(feat_val(u->features,"dur_cart"));
    global_stretch = get_param_float(u->features,"duration_stretch", 1.0);
    ds = val_dur_stats(feat_val(u->features,"dur_stats"));

    seg = relation_head(utt_relation(u,"Segment"));
    while (seg != NULL)
    {
        zdur = val_float(cart_interpret(seg,dur_tree));
        stat = phone_dur_stat(ds,item_name(seg));

        stretch = ffeature_float(seg, "R:SylStructure.parent.parent."
                                 "R:Token.parent.local_duration_stretch");
        if (stretch == 0)
            stretch = global_stretch;
        else
            stretch = stretch * global_stretch;

        dur = stretch * ((zdur*stat->stddev)+stat->mean);

        DPRINTF(0,("phone %s accent %s stress %s pdur %f stretch %f mean %f std %f dur %f\n",
                   item_name(seg),
                   ffeature_string(seg,"R:SylStructure.parent.accented"),
                   ffeature_string(seg,"R:SylStructure.parent.stress"),
                   zdur, stretch, stat->mean,
                   stat->stddev, dur));

        end += dur;
        item_set_float(seg,"end",end);

        seg = item_next(seg);
    }

    return u;
}
/* Tag each word with a part of speech.                                 */
/* When the "pos_tagger_cart" parameter is set, that CART is            */
/* interpreted for every item of the "Word" relation and the result is  */
/* stored as the word's "pos" feature.  Without the parameter the       */
/* utterance is returned unchanged.  Returns u.                         */
cst_utterance *default_pos_tagger(cst_utterance *u)
{
    const cst_val *cart_val, *tag;
    const cst_cart *tagger;
    cst_item *w;

    cart_val = get_param_val(u->features,"pos_tagger_cart",NULL);
    if (!cart_val)
        return u;

    tagger = val_cart(cart_val);

    w = relation_head(utt_relation(u,"Word"));
    while (w != NULL)
    {
        tag = cart_interpret(w,tagger);
        item_set_string(w,"pos",val_string(tag));
        w = item_next(w);
    }

    return u;
}
/* Build the syllable and segment structure for every word.             */
/* Creates the "Syllable", "SylStructure" and "Segment" relations.  For */
/* each item of "Word" the phone list is taken, in order of priority,   */
/* from:                                                                */
/*   1. the "phones" feature of the word's token (either a ready-made   */
/*      list, or a string that is parsed into a list),                  */
/*   2. the lexicon's addenda,                                          */
/*   3. lex_lookup() on the lexicon stored under the "lexicon" feature. */
/* Each phone becomes a segment under the current syllable; the         */
/* lexicon's syl_boundary function decides where syllables end.  A      */
/* trailing '1' or '0' on a phone name is stripped and recorded as the  */
/* syllable's "stress".  Phone lists created here (parsed or looked up) */
/* are deleted after use; borrowed lists are left alone.  Returns u.    */
cst_utterance *default_lexical_insertion(cst_utterance *u)
{
    cst_item *word, *token;
    cst_relation *sylstructure,*seg,*syl;
    cst_lexicon *lex;
    const cst_val *lex_addenda;
    const cst_val *p, *wp;
    char *phone_name;
    int plen;
    const char *stress = "0";
    const char *pos;
    cst_val *phones;
    cst_item *ssword, *sssyl, *segitem, *sylitem, *seg_in_syl;
    const cst_val *vpn;
    int dp;

    lex = val_lexicon(feat_val(u->features,"lexicon"));
    lex_addenda = lex->lex_addenda;     /* may be NULL */

    syl = utt_relation_create(u,"Syllable");
    sylstructure = utt_relation_create(u,"SylStructure");
    seg = utt_relation_create(u,"Segment");

    for (word=relation_head(utt_relation(u,"Word"));
         word; word=item_next(word))
    {
        ssword = relation_append(sylstructure,word);
        pos = ffeature_string(word,"pos");
        phones = NULL;
        wp = NULL;
        dp = 0;  /* should the phones get deleted or not */

        /* FIXME: need to make sure that textanalysis won't split
           tokens with explicit pronunciation (or that it will
           propagate such to words, then we can remove the path here) */
        token = item_parent(item_as(word, "Token"));
        if (item_feat_present(token, "phones"))
        {
            vpn = item_feat(token, "phones");
            if (cst_val_consp(vpn))
            {   /* for SAPI ?? */
                /* awb oct11: this seems wrong -- */
                /* not sure SAPI still (ever) works Oct11 */
                phones = (cst_val *) vpn;
            }
            else
            {
                dp = 1;
                if (cst_streq(val_string(vpn),
                              ffeature_string(word,"p.R:Token.parent.phones")))
                    phones = NULL; /* Already given these phones */
                else
                    phones = val_readlist_string(val_string(vpn));
            }
        }
        else
        {
            wp = val_assoc_string(item_feat_string(word, "name"),lex_addenda);
            if (wp)
                phones = (cst_val *)val_cdr(val_cdr(wp));
            else
            {
                dp = 1;
                phones = lex_lookup(lex,item_feat_string(word,"name"),pos,
                                    u->features);
            }
        }

        for (sssyl=NULL,sylitem=NULL,p=phones; p; p=val_cdr(p))
        {
            if (sylitem == NULL)
            {
                sylitem = relation_append(syl,NULL);
                sssyl = item_add_daughter(ssword,sylitem);
                stress = "0";
            }
            segitem = relation_append(seg,NULL);
            phone_name = cst_strdup(val_string(val_car(p)));

            /* A trailing stress digit is stripped from the phone name. */
            /* The length check keeps an empty name from being indexed  */
            /* at position -1.                                          */
            plen = (int)cst_strlen(phone_name);
            if (plen > 0 && phone_name[plen-1] == '1')
            {
                stress = "1";
                phone_name[plen-1] = '\0';
            }
            else if (plen > 0 && phone_name[plen-1] == '0')
            {
                stress = "0";
                phone_name[plen-1] = '\0';
            }

            item_set_string(segitem,"name",phone_name);
            seg_in_syl = item_add_daughter(sssyl,segitem);

            if ((lex->syl_boundary)(seg_in_syl,val_cdr(p)))
            {
                /* Syllable complete: next phone starts a new one */
                sylitem = NULL;
                if (sssyl)
                    item_set_string(sssyl,"stress",stress);
            }
            cst_free(phone_name);
        }

        if (dp)
        {
            delete_val(phones);
            phones = NULL;
        }
    }

    return u;
}
/* Dummy F0 modelling for phones, copied directly from us_f0_model.c    */
/*                                                                      */
/* Creates the "Target" relation with exactly two F0 targets: one at    */
/* position 0 with value mean+stddev, and one at the "end" time of the  */
/* last segment with value mean-stddev.  The mean comes from the        */
/* "target_f0_mean" parameter (default 100.0) scaled by "f0_shift"      */
/* (default 1.0); stddev from "target_f0_stddev" (default 12.0).        */
/* If the "Segment" relation is empty the second target is placed at    */
/* position 0 as well.  Returns u.                                      */
cst_utterance *flat_prosody(cst_utterance *u)
{
    /* F0 target model */
    cst_item *last_seg,*t;
    cst_relation *targ_rel;
    float mean, stddev;
    float end_pos = 0.0;

    targ_rel = utt_relation_create(u,"Target");
    mean = get_param_float(u->features,"target_f0_mean", 100.0);
    mean *= get_param_float(u->features,"f0_shift", 1.0);
    stddev = get_param_float(u->features,"target_f0_stddev", 12.0);

    /* Only the last segment is needed; guard against no segments */
    last_seg = relation_tail(utt_relation(u,"Segment"));
    if (last_seg != NULL)
        end_pos = item_feat_float(last_seg,"end");

    t = relation_append(targ_rel,NULL);
    item_set_float(t,"pos",0.0);
    item_set_float(t,"f0",mean+stddev);

    t = relation_append(targ_rel,NULL);
    item_set_float(t,"pos",end_pos);
    item_set_float(t,"f0",mean-stddev);

    return u;
}
/* Text analysis step for phone-string input: each token is a phone.    */
/* Creates the "Segment", "Syllable", "Word" and "SylStructure"         */
/* relations.  All phones are placed under one word named               */
/* "phonestring".  A token "-" ends the current syllable; a trailing    */
/* '1' or '0' on a token is stripped and stored as the syllable's       */
/* "stress".  A token that is not in the utterance's phoneset is        */
/* reported with cst_errmsg followed by cst_error().  Returns u.        */
static cst_utterance *tokentosegs(cst_utterance *u)
{
    cst_item *t;
    cst_relation *seg, *syl, *sylstructure, *word;
    cst_item *sylitem, *sylstructureitem, *worditem, *sssyl;
    cst_phoneset *ps;

    ps = val_phoneset(utt_feat_val(u, "phoneset"));
    /* Just copy tokens into the Segment relation */
    seg = utt_relation_create(u, "Segment");
    syl = utt_relation_create(u, "Syllable");
    word = utt_relation_create(u, "Word");
    sylstructure = utt_relation_create(u, "SylStructure");
    sssyl = sylitem = worditem = sylstructureitem = NULL;

    for (t = relation_head(utt_relation(u, "Token")); t; t = item_next(t))
    {
        /* NOTE(review): a segment is appended for every token, so a   */
        /* "-" token leaves an unnamed item in Segment -- confirm this */
        /* is intended.                                                */
        cst_item *segitem = relation_append(seg, NULL);
        char const *pname = item_feat_string(t, "name");
        char *name = cst_strdup(pname);
        int nlen = (int)cst_strlen(name);
        int unknown_phone = 0;

        if (worditem == NULL)
        {
            worditem = relation_append(word,NULL);
            item_set_string(worditem, "name", "phonestring");
            sylstructureitem = relation_append(sylstructure,worditem);
        }
        if (sylitem == NULL)
        {
            sylitem = relation_append(syl,NULL);
            sssyl = item_add_daughter(sylstructureitem,sylitem);
        }

        /* Strip a trailing stress digit; the length check keeps an */
        /* empty name from being indexed at position -1.            */
        if (nlen > 0 && name[nlen-1] == '1')
        {
            item_set_string(sssyl,"stress","1");
            name[nlen-1] = '\0';
        }
        else if (nlen > 0 && name[nlen-1] == '0')
        {
            item_set_string(sssyl,"stress","0");
            name[nlen-1] = '\0';
        }

        if (cst_streq(name,"-"))
        {
            sylitem = NULL;  /* syllable break */
        }
        else if (phone_id(ps, name) == -1)
        {
            unknown_phone = 1;
        }
        else
        {
            item_add_daughter(sssyl,segitem);
            item_set_string(segitem, "name", name);
        }

        /* Release the copy before reporting, so it is not leaked if */
        /* cst_error() does not return here.  pname still belongs to */
        /* the token item and stays valid for the message.           */
        cst_free(name);

        if (unknown_phone)
        {
            cst_errmsg("Phone `%s' not in phoneset\n", pname);
            cst_error();
        }
    }

    return u;
}
/* TRUE when c is an ASCII upper-case letter.  The explicit NUL test is */
/* needed because strchr treats the terminator as part of the string.   */
static int utt_break_is_upper(char c)
{
    return (c != '\0') &&
        (cst_strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ",c) != NULL);
}

/* TRUE when c is an ASCII lower-case letter (never for NUL). */
static int utt_break_is_lower(char c)
{
    return (c != '\0') &&
        (cst_strchr("abcdefghijklmnopqrstuvwxyz",c) != NULL);
}

/* This is the default utt break function, languages may override this. */
/* This will be ok for some latin based languages.                      */
/*                                                                      */
/* Decides whether an utterance boundary falls between the last item of */
/* tokens and the upcoming token.                                       */
/*   ts     - token stream; its whitespace field is the whitespace read */
/*            before token                                              */
/*   token  - the upcoming token                                        */
/*   tokens - relation of tokens collected so far; its tail item is     */
/*            read, so the relation must not be empty                   */
/* Returns TRUE to break before token, FALSE otherwise.  Rules, in      */
/* order:                                                               */
/*   - whitespace containing two newlines: break                        */
/*   - "Yahoo!" followed by a lower-case word: no break                 */
/*   - last token followed by ':', '?' or '!': break                    */
/*   - '.' followed by a capitalized word: break if more than one       */
/*     whitespace character intervenes, or if the last token does not   */
/*     look like an abbreviation (ends in a capital, or is shorter than */
/*     four characters and starts with a capital)                       */
/*   - anything else: no break                                          */
/* An empty token or empty last token never counts as capitalized or    */
/* lower-case.                                                          */
int default_utt_break(cst_tokenstream *ts,
                      const char *token,
                      cst_relation *tokens)
{
    const char *postpunct = item_feat_string(relation_tail(tokens), "punc");
    const char *ltoken = item_name(relation_tail(tokens));
    int ltoken_len;
    int last_is_abbrev;

    if (cst_strchr(ts->whitespace,'\n') != cst_strrchr(ts->whitespace,'\n'))
        /* contains two new lines */
        return TRUE;

    /* Well, this is a little specific isn't it. */
    if ((cst_streq(ltoken,"Yahoo") ||
         cst_streq(ltoken,"YAHOO") ||
         cst_streq(ltoken,"yahoo")) &&
        cst_strchr(postpunct,'!') &&
        utt_break_is_lower(token[0]))
        return FALSE;

    if (cst_strchr(postpunct,':') ||
        cst_strchr(postpunct,'?') ||
        cst_strchr(postpunct,'!'))
        return TRUE;

    /* The remaining rules need a period followed by a capitalized word */
    if (!cst_strchr(postpunct,'.'))
        return FALSE;
    if (!utt_break_is_upper(token[0]))
        return FALSE;

    if (cst_strlen(ts->whitespace) > 1)
        return TRUE;

    /* last word isn't an abbreviation */
    ltoken_len = (int)cst_strlen(ltoken);
    last_is_abbrev =
        (ltoken_len > 0 && utt_break_is_upper(ltoken[ltoken_len-1])) ||
        ((ltoken_len < 4) && utt_break_is_upper(ltoken[0]));

    return last_is_abbrev ? FALSE : TRUE;
}