blob: 7e6486b58377727cc75c772c4fbc33bdc7fb8f40 [file] [log] [blame]
/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 2001-2011 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author: Alan W Black (awb@cs.cmu.edu) */
/* Date: June 2008 */
/*************************************************************************/
/* */
/* SSML support for flite ( http://www.w3.org/TR/speech-synthesis/ ) */
/* */
/* We don't use a full XML parser here for space and availability */
/* reasons, but this is adequate for SSML */
/* This is based on some old SABLE support in flite that never got */
/* completed */
/* */
/* <ssml> </ssml> */
/* <voice ...> </voice> */
/* name or urls for voices */
/* <audio ...> </audio> */
/* <!-- ... --> */
/* <break .../> */
/* <prosody ...> </prosody> rate volume (no pitch yet) */
/* <emphasis ...> </emphasis> */
/* <sub alias="World Wide Web Consortium">W3C</sub> */
/* <phoneme ph="x x x"> </phoneme> */
/* */
/* <...> ignore all others */
/* */
/* Voice callbacks (e.g. -pw and -ps) are not transferred when new */
/* voices are selected */
/* */
/*************************************************************************/
#include "flite.h"
#include "cst_tokenstream.h"
/* Single-character symbol class installed (via set_charclasses) while    */
/* reading ordinary SSML text outside of a tag; presumably each of these */
/* characters is returned by the tokenstream as a token of its own.      */
static const char * const ssml_singlecharsymbols_general = "<>&/\";";
/* Single-character symbol class installed by ssml_get_attributes() while */
/* it reads the attribute list inside a tag; it adds '=' and drops '<'    */
/* and '&' relative to the general set.                                   */
static const char * const ssml_singlecharsymbols_inattr = "=>;/\"";
/* Set to 1 to make ssml_apply_tag() print every tag and its attributes.  */
#define SSML_DEBUG 0
/* Thin wrapper around ts_get_quoted_token(): asks the tokenstream for a  */
/* quoted token, passing '"' and '\\' (presumably the quote and escape    */
/* characters), and hands the result straight back to the caller.         */
static const char *ts_get_quoted_remainder(cst_tokenstream *ts)
{
    return ts_get_quoted_token(ts, '"', '\\');
}
/* Read the attribute list of the tag currently being parsed from TS, up  */
/* to and including the closing '>'.                                      */
/*                                                                        */
/* The result is a newly allocated feature set owned by the caller:       */
/*   "_type"            "start", or "startend" when a '/' was seen        */
/*   "_name0"/"_val0"   the first item read inside the tag                */
/*   "_name1"/"_val1"   every later item (later ones overwrite earlier)   */
/* A "_valN" entry is only written when the name is followed by '='.      */
/*                                                                        */
/* While reading, the tokenstream's single-char symbol class is switched  */
/* to the in-attribute set; the general set is restored on every exit.    */
/* If the stream ends before '>' is found a message is printed and the    */
/* attributes collected so far are returned, so this never returns NULL.  */
static cst_features *ssml_get_attributes(cst_tokenstream *ts)
{
    cst_features *a = new_features();
    const char *name, *val;
    const char *fnn, *vnn;
    int i = 0;

    /* Default for a tag with no attributes at all (e.g. "<voice>"), so   */
    /* that "_type" is always present for whoever inspects the result.    */
    feat_set_string(a, "_type", "start");

    set_charclasses(ts,
                    ts->p_whitespacesymbols,
                    ssml_singlecharsymbols_inattr,
                    ts->p_prepunctuationsymbols,
                    ts->p_postpunctuationsymbols);

    name = ts_get(ts);
    while (!cst_streq(">", name))
    {
        /* I want names and values to be const, so the feature names are  */
        /* picked from literals rather than built at run time.            */
        if (i == 0)
        {
            fnn = "_name0";
            vnn = "_val0";
        }
        else
        {
            fnn = "_name1";
            vnn = "_val1";
        }

        if (cst_streq(name, "/"))
            feat_set_string(a, "_type", "startend");
        else
        {
            feat_set_string(a, "_type", "start");
            feat_set_string(a, fnn, name);
            if (cst_streq("=", ts_get(ts)))
            {
                val = ts_get_quoted_remainder(ts);
                feat_set_string(a, vnn, val);
            }
        }

        if (ts_eof(ts))
        {
            /* Unterminated tag: stop here but still fall through to the  */
            /* char-class restore below and return what was parsed.       */
            fprintf(stderr, "ssml: unexpected EOF\n");
            break;
        }
        name = ts_get(ts);
        i++;
    }

    set_charclasses(ts,
                    ts->p_whitespacesymbols,
                    ssml_singlecharsymbols_general,
                    ts->p_prepunctuationsymbols,
                    ts->p_postpunctuationsymbols);

    return a;
}
/* Read attribute value VNAME from ATTRIBUTES as a float.                 */
/* Returns 1 and stores the value in *OUT when the attribute is present,  */
/* returns 0 (leaving *OUT untouched) when it is absent.                  */
static int ssml_attr_float(const cst_features *attributes,
                           const char *vname,
                           float *out)
{
    if (get_param_string(attributes, vname, NULL) == NULL)
        return 0;
    *out = feat_float(attributes, vname);
    return 1;
}

/* Apply one SSML tag.                                                    */
/*                                                                        */
/*   tag         upper-cased tag name (AUDIO, BREAK, PROSODY, PHONEME,    */
/*               SUB, VOICE; anything else is ignored)                    */
/*   attributes  "_type"/"_nameN"/"_valN" features for the tag; NULL is   */
/*               tolerated and treated as "nothing to do"                 */
/*   u           utterance under construction (BREAK marks its last       */
/*               Token item)                                              */
/*   word_feats  features copied onto each following token; tags add and  */
/*               remove entries here                                      */
/*   feats       stream-level state: "current_voice" / "default_voice"    */
/*                                                                        */
/* Returns U when processing can simply continue, or NULL to tell the     */
/* caller that the tag forces an end of utterance.                        */
static cst_utterance *ssml_apply_tag(const char *tag,
                                     cst_features *attributes,
                                     cst_utterance *u,
                                     cst_features *word_feats,
                                     cst_features *feats)
{
    static const char * const slot_name[2] = { "_name0", "_name1" };
    static const char * const slot_val[2] = { "_val0", "_val1" };
    const char *type;
    const char *aname;
    const char *wavefilename;
    const char *vname;
    const char *sval;
    cst_voice *nvoice;
    cst_wave *wave;
    cst_item *t;
    cst_relation *r;
    float fval;
    int k;

    /* The attribute parser may hand back no features at all when the     */
    /* input ends inside a tag; there is nothing to apply in that case.   */
    if (attributes == NULL)
        return u;

#if SSML_DEBUG
    printf("SSML TAG %s\n",tag);
    cst_feat_print(stdout,attributes);
    printf("...\n");
#endif

    /* Looked up with a default so a missing "_type" compares unequal to  */
    /* every known type instead of being a failed mandatory lookup.       */
    type = get_param_string(attributes, "_type", "");

    if (cst_streq("AUDIO", tag))
    {
        if (cst_streq("start", type) || cst_streq("startend", type))
        {
            wavefilename = get_param_string(attributes, "_val0", "");
            if (wavefilename[0] != '\0')
            {
                wave = new_wave();
                if (cst_wave_load_riff(wave, wavefilename) == CST_OK_FORMAT)
                {
                    /* <audio ...> with a body: the enclosed tokens are   */
                    /* flagged via "ssml_comment" until </audio>.         */
                    if (cst_streq("start", type))
                        feat_set_string(word_feats, "ssml_comment", "1");
                    feat_set(word_feats, "ssml_play_audio", wave_val(wave));
                }
                else
                    delete_wave(wave);
            }
            return NULL; /* Cause eou */
        }
        else if (cst_streq("end", type))
        {
            feat_remove(word_feats, "ssml_comment");
            return NULL; /* Cause eou */
        }
    }
    else if (cst_streq("BREAK", tag))
    {
        /* Attach the break to the most recent token, if there is one */
        if (u &&
            ((r = utt_relation(u, "Token")) != NULL) &&
            ((t = relation_tail(r)) != NULL))
        {
            item_set_string(t, "break", "1");
            if (cst_streq("size", get_param_string(attributes, "_name0", "")) &&
                ssml_attr_float(attributes, "_val0", &fval))
                item_set_float(t, "break_size", fval);
        }
    }
    else if (cst_streq("PROSODY", tag))
    {
        if (cst_streq("start", type))
        {
            for (k = 0; k < 2; k++)
            {
                aname = get_param_string(attributes, slot_name[k], "");
                if (!ssml_attr_float(attributes, slot_val[k], &fval))
                    continue; /* attribute given without a value */

                if (cst_streq("rate", aname))
                {
                    /* Note SSML doesn't do stretch, it does the          */
                    /* reciprocal of stretch.  A zero (or unparsable) or  */
                    /* negative rate is ignored rather than divided by.   */
                    if (fval > 0.0)
                        feat_set_float(word_feats, "local_duration_stretch",
                                       1.0/fval);
                }
                else if (cst_streq("volume", aname))
                    feat_set_float(word_feats, "local_gain", fval/100.0);
            }
        }
        else if (cst_streq("end", type))
        {
            feat_remove(word_feats, "local_duration_stretch");
            feat_remove(word_feats, "local_gain");
        }
    }
    else if (cst_streq("PHONEME", tag))
    {
        if (cst_streq("start", type))
        {
            if (cst_streq("ph", get_param_string(attributes, "_name0", "")))
            {
                sval = get_param_string(attributes, "_val0", NULL);
                if (sval != NULL)
                    feat_set_string(word_feats, "phones", sval);
            }
        }
        else if (cst_streq("end", type))
        {
            feat_remove(word_feats, "phones");
        }
    }
    else if (cst_streq("SUB", tag))
    {
        if (cst_streq("start", type))
        {
            if (cst_streq("alias", get_param_string(attributes, "_name0", "")))
            {
                sval = get_param_string(attributes, "_val0", NULL);
                if (sval != NULL)
                    feat_set_string(word_feats, "ssml_alias", sval);
            }
        }
        else if (cst_streq("end", type))
        {
            feat_remove(word_feats, "ssml_alias");
        }
    }
    else if (cst_streq("VOICE", tag))
    {
        if (cst_streq("start", type))
        {
            vname = get_param_string(attributes, "_val0", "");
            nvoice = flite_voice_select(vname);
            /* NOTE(review): flite_voice_select presumably can return     */
            /* NULL when no voice is available; keep the current voice    */
            /* then instead of installing a NULL one.                     */
            if (nvoice != NULL)
                feat_set(feats, "current_voice", userdata_val(nvoice));
            return NULL; /* cause an utterance break */
        }
        else if (cst_streq("end", type))
        {
            /* Hmm we should really have a stack of these */
            nvoice =
                (cst_voice *)val_userdata(feat_val(feats, "default_voice"));
            feat_set(feats, "current_voice", userdata_val(nvoice));
            return NULL;
        }
    }

    /* Design notes carried over from the original:                       */
    /*   ph     set attributes, silence all contained tokens              */
    /*   break  add a break marker to the previous token                  */
    /*   audio  silence all following tokens (utt break), insert waveform */
    return u;
}
/* Synthesize the SSML document readable from TS.                         */
/*                                                                        */
/*   ts       tokenstream positioned at the start of the SSML text        */
/*   voice    voice used initially and restored by </voice>               */
/*   outtype  "play", "none", "stream", or otherwise the name of a wave   */
/*            file that is created empty here and then passed on to       */
/*            flite_process_output() for each utterance                   */
/*                                                                        */
/* Returns the sum of the values returned by flite_process_output() for   */
/* every utterance and audio clip that was output.                        */
static float flite_ssml_to_speech_ts(cst_tokenstream *ts,
                                     cst_voice *voice,
                                     const char *outtype)
{
    /* This is a very ugly function, that might be better written with gotos */
    /* This just doesn't seem to be properly functions -- perhaps a proper   */
    /* consumer/producer threaded model might be better here -- but its      */
    /* not clear.  There is so much have-to-be-done-now vs note-for-later    */
    /* code, that the code is far from clear, and probably not right         */
    cst_features *ssml_feats, *ssml_word_feats;
    cst_features *attributes;
    const char *token = "";
    char *tag=NULL;
    cst_utterance *utt;
    cst_relation *tokrel;
    int num_tokens;
    cst_breakfunc breakfunc = default_utt_break;
    cst_uttfunc utt_user_callback = 0;
    float durs = 0.0;
    cst_item *t;
    cst_voice *current_voice;
    int ssml_eou = 0;
    const cst_wave *wave;
    cst_wave *w;

    /* Stream-level state shared with ssml_apply_tag() */
    ssml_feats = new_features();
    feat_set(ssml_feats,"current_voice",userdata_val(voice));
    feat_set(ssml_feats,"default_voice",userdata_val(voice));
    /* Per-token features that tags switch on and off */
    ssml_word_feats = new_features();

    set_charclasses(ts,
                    " \t\n\r",
                    ssml_singlecharsymbols_general,
                    get_param_string(voice->features,"text_prepunctuation",""),
                    get_param_string(voice->features,"text_postpunctuation","")
                    );

    if (feat_present(voice->features,"utt_break"))
        breakfunc = val_breakfunc(feat_val(voice->features,"utt_break"));

    if (feat_present(voice->features,"utt_user_callback"))
        utt_user_callback = val_uttfunc(feat_val(voice->features,"utt_user_callback"));

    /* If its a file to write to, create and save an empty wave file */
    /* as we are going to incrementally append to it                 */
    /* (the sample rate written here is fixed at 16000)              */
    if (!cst_streq(outtype,"play") &&
        !cst_streq(outtype,"none") &&
        !cst_streq(outtype,"stream"))
    {
        w = new_wave();
        cst_wave_resize(w,0,1);
        cst_wave_set_sample_rate(w,16000);
        cst_wave_save_riff(w,outtype);  /* an empty wave */
        delete_wave(w);
    }

    num_tokens = 0;
    utt = new_utterance();
    tokrel = utt_relation_create(utt, "Token");
    while (!ts_eof(ts) || num_tokens > 0)
    {
        current_voice =
            (cst_voice *)val_userdata(feat_val(ssml_feats,"current_voice"));
        /* printf("awb_debug prewhile %d %s\n",ssml_eou,token); */
        if (ssml_eou == 0)
            token = ts_get(ts);
        else
        {
            /* The previous pass ended on a tag-forced utterance break.   */
            /* A pending "<" still has to be processed as a tag below, so */
            /* only fetch a new token when the held one is not "<".       */
            if (!cst_streq("<",token))
                token = ts_get(ts);
            ssml_eou = 0;
        }
        while ((cst_streq("<",token)) && (ssml_eou == 0))
        {   /* A tag -- look ahead and process it to find out how to advance */
            tag = cst_upcase(ts_get(ts));
            /* printf("awb_debug tag is %s\n",tag); */
            if (cst_streq("/",tag)) /* an end tag */
            {
                cst_free(tag); tag=NULL;
                tag = cst_upcase(ts_get(ts));
                attributes = ssml_get_attributes(ts);
                if (attributes)
                    feat_set_string(attributes,"_type","end");
            }
            else
                attributes = ssml_get_attributes(ts);

            if (attributes == NULL)
            {
                /* The input ended inside the tag.  Drop the tag and use  */
                /* an empty token so the code below flushes the pending   */
                /* utterance and then leaves the main loop on EOF.        */
                cst_free(tag); tag=NULL;
                token = "";
                break;
            }

            /* ssml_get_attributes() has already consumed the closing    */
            /* ">", so this fetches the first token after the tag.       */
            token = ts_get(ts);
            if (ssml_apply_tag(tag,attributes,utt,ssml_word_feats,ssml_feats))
                ssml_eou = 0;
            else
                ssml_eou = 1;
            delete_features(attributes);
            cst_free(tag); tag=NULL;
        }

        if ((cst_strlen(token) == 0) ||
            (num_tokens > 500) ||  /* need an upper bound */
            (ssml_eou == 1) ||     /* ssml tag was utterance break */
            (relation_head(tokrel) &&
             breakfunc(ts,token,tokrel)))
        {
            /* An end of utt, so synthesize it */
            if (utt_user_callback)
                utt = (utt_user_callback)(utt);

            if (utt)
            {
                utt = flite_do_synth(utt,current_voice,utt_synth_tokens);
                /* NOTE(review): flite_do_synth presumably returns NULL   */
                /* when synthesis fails; stop rather than dereference it. */
                if (utt == NULL)
                    break;
                if (feat_present(utt->features,"Interrupted"))
                {
                    delete_utterance(utt); utt = NULL;
                    break;
                }
                durs += flite_process_output(utt,outtype,TRUE);
                delete_utterance(utt); utt = NULL;
            }
            else
                break;

            if (ts_eof(ts)) break;

            utt = new_utterance();
            tokrel = utt_relation_create(utt, "Token");
            num_tokens = 0;
        }

        if (feat_present(ssml_word_feats,"ssml_play_audio"))
        {
            wave = val_wave(feat_val(ssml_word_feats,"ssml_play_audio"));
            /* Should create an utterances with the waveform in it */
            /* Have to stream it if there is streaming */
            if (utt) delete_utterance(utt);
            utt = utt_synth_wave(copy_wave(wave),current_voice);
            if (utt_user_callback)
                utt = (utt_user_callback)(utt);
            /* The callback may have swallowed the utterance */
            if (utt)
            {
                durs += flite_process_output(utt,outtype,TRUE);
                delete_utterance(utt); utt = NULL;
            }

            utt = new_utterance();
            tokrel = utt_relation_create(utt, "Token");
            num_tokens = 0;

            feat_remove(ssml_word_feats,"ssml_play_audio");
        }
        else if (!cst_streq("<",token))
        {   /* wasn't an ssml tag */
            num_tokens++;

            t = relation_append(tokrel, NULL);
            item_set_string(t,"name",token);
            item_set_string(t,"whitespace",ts->whitespace);
            item_set_string(t,"prepunctuation",ts->prepunctuation);
            item_set_string(t,"punc",ts->postpunctuation);
            /* Mark it at the beginning of the token */
            item_set_int(t,"file_pos",
                         ts->file_pos-(1+ /* as we are already on the next char */
                                       cst_strlen(token)+
                                       cst_strlen(ts->prepunctuation)+
                                       cst_strlen(ts->postpunctuation)));
            item_set_int(t,"line_number",ts->line_number);
            /* Stamp the currently active tag features onto the token */
            feat_copy_into(ssml_word_feats,item_feats(t));
        }
    }

    delete_utterance(utt);
    delete_features(ssml_feats);
    delete_features(ssml_word_feats);
    return durs;
}
/* Synthesize the SSML document stored in file FILENAME with VOICE.       */
/*                                                                        */
/*   filename  path of the SSML file to read                              */
/*   voice     voice whose features supply the tokenstream character      */
/*             classes and the optional "file_start_position"             */
/*   outtype   passed through unchanged to flite_ssml_to_speech_ts()      */
/*                                                                        */
/* Returns the value produced by flite_ssml_to_speech_ts(), or 1 after    */
/* printing an error message when the file cannot be opened.              */
float flite_ssml_file_to_speech(const char *filename,
                                cst_voice *voice,
                                const char *outtype)
{
    cst_tokenstream *ts;
    int fp;
    float d;

    ts = ts_open(filename,
                 get_param_string(voice->features,"text_whitespace",NULL),
                 get_param_string(voice->features,"text_singlecharsymbols",NULL),
                 get_param_string(voice->features,"text_prepunctuation",NULL),
                 get_param_string(voice->features,"text_postpunctuation",NULL));
    if (ts == NULL)
    {
        cst_errmsg("failed to open file \"%s\" for ssml reading\n",
                   filename);
        return 1;
    }

    /* Optionally start part-way into the input */
    fp = get_param_int(voice->features,"file_start_position",0);
    if (fp > 0)
        ts_set_stream_pos(ts,fp);

    /* When OUTTYPE names a file, flite_ssml_to_speech_ts() writes the    */
    /* initial empty wave itself before synthesizing anything, so it is   */
    /* not created a second time here.                                    */
    d = flite_ssml_to_speech_ts(ts,voice,outtype);

    ts_close(ts);

    return d;
}
/* Synthesize the SSML document held in the string TEXT with VOICE.       */
/*                                                                        */
/*   text     SSML text to read                                           */
/*   voice    voice whose features supply the tokenstream character       */
/*            classes and the optional "file_start_position"              */
/*   outtype  passed through unchanged to flite_ssml_to_speech_ts()       */
/*                                                                        */
/* Returns the value produced by flite_ssml_to_speech_ts(), or 1 (with    */
/* no message) when a tokenstream cannot be opened on the string.         */
float flite_ssml_text_to_speech(const char *text,
                                cst_voice *voice,
                                const char *outtype)
{
    cst_tokenstream *ts;
    int fp;
    float d;

    ts = ts_open_string(text,
                        get_param_string(voice->features,"text_whitespace",NULL),
                        get_param_string(voice->features,"text_singlecharsymbols",NULL),
                        get_param_string(voice->features,"text_prepunctuation",NULL),
                        get_param_string(voice->features,"text_postpunctuation",NULL));
    if (ts == NULL)
    {
        return 1;
    }

    /* Optionally start part-way into the input */
    fp = get_param_int(voice->features,"file_start_position",0);
    if (fp > 0)
        ts_set_stream_pos(ts,fp);

    /* When OUTTYPE names a file, flite_ssml_to_speech_ts() writes the    */
    /* initial empty wave itself before synthesizing anything, so it is   */
    /* not created a second time here.                                    */
    d = flite_ssml_to_speech_ts(ts,voice,outtype);

    ts_close(ts);

    return d;
}