| /*************************************************************************/ |
| /* */ |
| /* Language Technologies Institute */ |
| /* Carnegie Mellon University */ |
| /* Copyright (c) 2001-2011 */ |
| /* All Rights Reserved. */ |
| /* */ |
| /* Permission is hereby granted, free of charge, to use and distribute */ |
| /* this software and its documentation without restriction, including */ |
| /* without limitation the rights to use, copy, modify, merge, publish, */ |
| /* distribute, sublicense, and/or sell copies of this work, and to */ |
| /* permit persons to whom this work is furnished to do so, subject to */ |
| /* the following conditions: */ |
| /* 1. The code must retain the above copyright notice, this list of */ |
| /* conditions and the following disclaimer. */ |
| /* 2. Any modifications must be clearly marked as such. */ |
| /* 3. Original authors' names are not deleted. */ |
| /* 4. The authors' names are not used to endorse or promote products */ |
| /* derived from this software without specific prior written */ |
| /* permission. */ |
| /* */ |
| /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ |
| /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ |
| /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
| /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ |
| /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ |
| /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ |
| /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ |
| /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ |
| /* THIS SOFTWARE. */ |
| /* */ |
| /*************************************************************************/ |
| /* Author: Alan W Black (awb@cs.cmu.edu) */ |
| /* Date: June 2008 */ |
| /*************************************************************************/ |
| /* */ |
| /* SSML support for flite ( http://www.w3.org/TR/speech-synthesis/ ) */ |
| /* */ |
| /* We don't use a full XML parser here for space and availability */ |
| /* reasons, but this is adequate for SSML */ |
| /* This is based on some old SABLE support in flite that never got */ |
| /* completed */ |
| /* */ |
| /* <ssml> </ssml> */ |
| /* <voice ...> </voice> */ |
| /* name or urls for voices */ |
| /* <audio ...> </audio> */ |
| /* <!-- ... --> */ |
| /* <break .../> */ |
| /* <prosody ...> </prosody> rate volume (no pitch yet) */ |
| /* <emphasis ...> </emphasis> */ |
| /* <sub alias="World Wide Web Consortium">W3C</sub> */ |
| /* <phoneme ph="x x x"> </phoneme> */ |
| /* */ |
| /* <...> ignore all others */ |
| /* */ |
| /* Voice call backs (e.g. -pw and -ps) are not transfered when new */ |
| /* voices are selected */ |
| /* */ |
| /*************************************************************************/ |
| |
| #include "flite.h" |
| #include "cst_tokenstream.h" |
| |
| static const char * const ssml_singlecharsymbols_general = "<>&/\";"; |
| static const char * const ssml_singlecharsymbols_inattr = "=>;/\""; |
| |
| #define SSML_DEBUG 0 |
| |
| static const char *ts_get_quoted_remainder(cst_tokenstream *ts) |
| { |
| const char *q; |
| |
| q = ts_get_quoted_token(ts,'"','\\'); |
| |
| return q; |
| } |
| |
| static cst_features *ssml_get_attributes(cst_tokenstream *ts) |
| { |
| cst_features *a = new_features(); |
| const char* name, *val; |
| const char *fnn,*vnn; |
| int i=0; |
| |
| set_charclasses(ts, |
| ts->p_whitespacesymbols, |
| ssml_singlecharsymbols_inattr, |
| ts->p_prepunctuationsymbols, |
| ts->p_postpunctuationsymbols); |
| |
| name = ts_get(ts); |
| while (!cst_streq(">",name)) |
| { |
| /* I want names and values to be const */ |
| if (i == 0) |
| { |
| fnn="_name0"; vnn="_val0"; |
| } |
| else |
| { |
| fnn="_name1"; vnn="_val1"; |
| } |
| if (cst_streq(name,"/")) |
| feat_set_string(a,"_type","startend"); |
| else |
| { |
| feat_set_string(a,"_type","start"); |
| feat_set_string(a,fnn,name); |
| if (cst_streq("=",ts_get(ts))) |
| { |
| val = ts_get_quoted_remainder(ts); |
| feat_set_string(a,vnn,val); |
| } |
| } |
| if (ts_eof(ts)) |
| { |
| fprintf(stderr,"ssml: unexpected EOF\n"); |
| delete_features(a); |
| return 0; |
| } |
| name = ts_get(ts); |
| i++; |
| } |
| |
| set_charclasses(ts, |
| ts->p_whitespacesymbols, |
| ssml_singlecharsymbols_general, |
| ts->p_prepunctuationsymbols, |
| ts->p_postpunctuationsymbols); |
| |
| return a; |
| } |
| |
| static cst_utterance *ssml_apply_tag(const char *tag, |
| cst_features *attributes, |
| cst_utterance *u, |
| cst_features *word_feats, |
| cst_features *feats) |
| { |
| const char *wavefilename; |
| const char *vname; |
| cst_voice *nvoice; |
| cst_wave *wave; |
| cst_item *t; |
| cst_relation *r; |
| float break_size; |
| |
| #if SSML_DEBUG |
| printf("SSML TAG %s\n",tag); |
| cst_feat_print(stdout,attributes); |
| printf("...\n"); |
| #endif |
| |
| if (cst_streq("AUDIO",tag)) |
| { |
| if ((cst_streq("start",feat_string(attributes,"_type"))) || |
| (cst_streq("startend",feat_string(attributes,"_type")))) |
| { |
| wavefilename = feat_string(attributes,"_val0"); |
| wave = new_wave(); |
| if (cst_wave_load_riff(wave,wavefilename) == CST_OK_FORMAT) |
| { |
| if (cst_streq("start",feat_string(attributes,"_type"))) |
| { |
| feat_set_string(word_feats,"ssml_comment","1"); |
| } |
| feat_set(word_feats,"ssml_play_audio",wave_val(wave)); |
| } |
| else |
| delete_wave(wave); |
| return NULL; /* Cause eou */ |
| } |
| else if (cst_streq("end",feat_string(attributes,"_type"))) |
| { |
| feat_remove(word_feats,"ssml_comment"); |
| return NULL; /* Cause eou */ |
| } |
| } |
| else if (cst_streq("BREAK",tag)) |
| { |
| if (u && |
| ((r = utt_relation(u,"Token")) != NULL) && |
| ((t = relation_tail(r)) != NULL)) |
| { |
| item_set_string(t,"break","1"); |
| /* cst_feat_print(stdout,attributes); */ |
| if (cst_streq("size",get_param_string(attributes,"_name0",""))) |
| { |
| break_size=feat_float(attributes,"_val0"); |
| item_set_float(t,"break_size",break_size); |
| } |
| } |
| } |
| else if (cst_streq("PROSODY",tag)) |
| { |
| if (cst_streq("start",feat_string(attributes,"_type"))) |
| { |
| /* Note SSML doesn't do stretch it does reciprical of stretch */ |
| if (cst_streq("rate",get_param_string(attributes,"_name0",""))) |
| feat_set_float(word_feats,"local_duration_stretch", |
| 1.0/feat_float(attributes,"_val0")); |
| if (cst_streq("rate",get_param_string(attributes,"_name1",""))) |
| feat_set_float(word_feats,"local_duration_stretch", |
| 1.0/feat_float(attributes,"_val1")); |
| if (cst_streq("volume",get_param_string(attributes,"_name0",""))) |
| feat_set_float(word_feats,"local_gain", |
| feat_float(attributes,"_val0")/100.0); |
| if (cst_streq("volume",get_param_string(attributes,"_name1",""))) |
| feat_set_float(word_feats,"local_gain", |
| feat_float(attributes,"_val1")/100.0); |
| } |
| else if (cst_streq("end",feat_string(attributes,"_type"))) |
| { |
| feat_remove(word_feats,"local_duration_stretch"); |
| feat_remove(word_feats,"local_gain"); |
| } |
| |
| } |
| else if (cst_streq("PHONEME",tag)) |
| { |
| if (cst_streq("start",feat_string(attributes,"_type"))) |
| { |
| if (cst_streq("ph",get_param_string(attributes,"_name0",""))) |
| { |
| const char *ph; |
| ph = feat_string(attributes,"_val0"); |
| feat_set_string(word_feats,"phones",ph); |
| } |
| } |
| else if (cst_streq("end",feat_string(attributes,"_type"))) |
| { |
| feat_remove(word_feats,"phones"); |
| } |
| |
| } |
| else if (cst_streq("SUB",tag)) |
| { |
| if (cst_streq("start",feat_string(attributes,"_type"))) |
| { |
| if (cst_streq("alias",get_param_string(attributes,"_name0",""))) |
| { |
| const char *alias; |
| alias = feat_string(attributes,"_val0"); |
| feat_set_string(word_feats,"ssml_alias",alias); |
| } |
| } |
| else if (cst_streq("end",feat_string(attributes,"_type"))) |
| { |
| feat_remove(word_feats,"ssml_alias"); |
| } |
| |
| } |
| else if (cst_streq("VOICE",tag)) |
| { |
| if (cst_streq("start",feat_string(attributes,"_type"))) |
| { |
| vname = get_param_string(attributes,"_val0",""); |
| nvoice = flite_voice_select(vname); |
| feat_set(feats,"current_voice",userdata_val(nvoice)); |
| return NULL; /* cause an utterance break */ |
| } |
| else if (cst_streq("end",feat_string(attributes,"_type"))) |
| { |
| /* Hmm we should really have a stack of these */ |
| nvoice = |
| (cst_voice *)val_userdata(feat_val(feats,"default_voice")); |
| feat_set(feats,"current_voice",userdata_val(nvoice)); |
| return NULL; |
| } |
| } |
| |
| /* do stuff */ |
| /* flag what to do mark or end */ |
| /* |
| ph set attributes silence all contained tokens |
| break add to previous token a break marker |
| audio silence all following tokens (utt break) |
| insert waveform |
| |
| */ |
| |
| return u; |
| } |
| |
| static float flite_ssml_to_speech_ts(cst_tokenstream *ts, |
| cst_voice *voice, |
| const char *outtype) |
| { |
| /* This is a very ugly function, that might be better written with gotos */ |
| /* This just doesn't seem to be properly functions -- perhaps a proper */ |
| /* consumer/producer threaded model might be better here -- but its */ |
| /* not clear. There is so much have-to-be-done-now vs note-for-later */ |
| /* code, that the code is far from clear, and probably not right */ |
| cst_features *ssml_feats, *ssml_word_feats; |
| cst_features *attributes; |
| const char *token = ""; |
| char *tag=NULL; |
| cst_utterance *utt; |
| cst_relation *tokrel; |
| int num_tokens; |
| cst_breakfunc breakfunc = default_utt_break; |
| cst_uttfunc utt_user_callback = 0; |
| float durs = 0.0; |
| cst_item *t; |
| cst_voice *current_voice; |
| int ssml_eou = 0; |
| const cst_wave *wave; |
| cst_wave *w; |
| |
| ssml_feats = new_features(); |
| feat_set(ssml_feats,"current_voice",userdata_val(voice)); |
| feat_set(ssml_feats,"default_voice",userdata_val(voice)); |
| ssml_word_feats = new_features(); |
| set_charclasses(ts, |
| " \t\n\r", |
| ssml_singlecharsymbols_general, |
| get_param_string(voice->features,"text_prepunctuation",""), |
| get_param_string(voice->features,"text_postpunctuation","") |
| ); |
| |
| if (feat_present(voice->features,"utt_break")) |
| breakfunc = val_breakfunc(feat_val(voice->features,"utt_break")); |
| |
| if (feat_present(voice->features,"utt_user_callback")) |
| utt_user_callback = val_uttfunc(feat_val(voice->features,"utt_user_callback")); |
| |
| /* If its a file to write to, create and save an empty wave file */ |
| /* as we are going to incrementally append to it */ |
| if (!cst_streq(outtype,"play") && |
| !cst_streq(outtype,"none") && |
| !cst_streq(outtype,"stream")) |
| { |
| w = new_wave(); |
| cst_wave_resize(w,0,1); |
| cst_wave_set_sample_rate(w,16000); |
| cst_wave_save_riff(w,outtype); /* an empty wave */ |
| delete_wave(w); |
| } |
| |
| num_tokens = 0; |
| utt = new_utterance(); |
| |
| tokrel = utt_relation_create(utt, "Token"); |
| while (!ts_eof(ts) || num_tokens > 0) |
| { |
| current_voice = |
| (cst_voice *)val_userdata(feat_val(ssml_feats,"current_voice")); |
| /* printf("awb_debug prewhile %d %s\n",ssml_eou,token); */ |
| if (ssml_eou == 0) |
| token = ts_get(ts); |
| else |
| { |
| if (!cst_streq("<",token)) |
| token = ts_get(ts); |
| ssml_eou = 0; |
| } |
| while ((cst_streq("<",token)) && (ssml_eou == 0)) |
| { /* A tag -- look ahead and process it to find out how to advance */ |
| tag = cst_upcase(ts_get(ts)); |
| /* printf("awb_debug tag is %s\n",tag); */ |
| if (cst_streq("/",tag)) /* an end tag */ |
| { |
| cst_free(tag); tag=NULL; |
| tag = cst_upcase(ts_get(ts)); |
| attributes = ssml_get_attributes(ts); |
| feat_set_string(attributes,"_type","end"); |
| } |
| else |
| attributes = ssml_get_attributes(ts); |
| token = ts_get(ts); /* skip ">" */ |
| if (ssml_apply_tag(tag,attributes,utt,ssml_word_feats,ssml_feats)) |
| ssml_eou = 0; |
| else |
| ssml_eou = 1; |
| |
| delete_features(attributes); |
| cst_free(tag); tag=NULL; |
| } |
| |
| if ((cst_strlen(token) == 0) || |
| (num_tokens > 500) || /* need an upper bound */ |
| (ssml_eou == 1) || /* ssml tag was utterance break */ |
| (relation_head(tokrel) && |
| breakfunc(ts,token,tokrel))) |
| { |
| /* An end of utt, so synthesize it */ |
| if (utt_user_callback) |
| utt = (utt_user_callback)(utt); |
| |
| if (utt) |
| { |
| utt = flite_do_synth(utt,current_voice,utt_synth_tokens); |
| if (feat_present(utt->features,"Interrupted")) |
| { |
| delete_utterance(utt); utt = NULL; |
| break; |
| } |
| durs += flite_process_output(utt,outtype,TRUE); |
| delete_utterance(utt); utt = NULL; |
| } |
| else |
| break; |
| |
| if (ts_eof(ts)) break; |
| |
| utt = new_utterance(); |
| tokrel = utt_relation_create(utt, "Token"); |
| num_tokens = 0; |
| } |
| |
| if (feat_present(ssml_word_feats,"ssml_play_audio")) |
| { |
| wave = val_wave(feat_val(ssml_word_feats,"ssml_play_audio")); |
| /* Should create an utterances with the waveform in it */ |
| /* Have to stream it if there is streaming */ |
| if (utt) delete_utterance(utt); |
| utt = utt_synth_wave(copy_wave(wave),current_voice); |
| if (utt_user_callback) |
| utt = (utt_user_callback)(utt); |
| durs += flite_process_output(utt,outtype,TRUE); |
| delete_utterance(utt); utt = NULL; |
| |
| utt = new_utterance(); |
| tokrel = utt_relation_create(utt, "Token"); |
| num_tokens = 0; |
| |
| feat_remove(ssml_word_feats,"ssml_play_audio"); |
| } |
| else if (!cst_streq("<",token)) |
| { /* wasn't an ssml tag */ |
| num_tokens++; |
| |
| t = relation_append(tokrel, NULL); |
| item_set_string(t,"name",token); |
| item_set_string(t,"whitespace",ts->whitespace); |
| item_set_string(t,"prepunctuation",ts->prepunctuation); |
| item_set_string(t,"punc",ts->postpunctuation); |
| /* Mark it at the beginning of the token */ |
| item_set_int(t,"file_pos", |
| ts->file_pos-(1+ /* as we are already on the next char */ |
| cst_strlen(token)+ |
| cst_strlen(ts->prepunctuation)+ |
| cst_strlen(ts->postpunctuation))); |
| item_set_int(t,"line_number",ts->line_number); |
| feat_copy_into(ssml_word_feats,item_feats(t)); |
| } |
| } |
| |
| delete_utterance(utt); |
| delete_features(ssml_feats); |
| delete_features(ssml_word_feats); |
| return durs; |
| } |
| |
| float flite_ssml_file_to_speech(const char *filename, |
| cst_voice *voice, |
| const char *outtype) |
| { |
| cst_tokenstream *ts; |
| int fp; |
| cst_wave *w; |
| float d; |
| |
| if ((ts = ts_open(filename, |
| get_param_string(voice->features,"text_whitespace",NULL), |
| get_param_string(voice->features,"text_singlecharsymbols",NULL), |
| get_param_string(voice->features,"text_prepunctuation",NULL), |
| get_param_string(voice->features,"text_postpunctuation",NULL))) |
| == NULL) |
| { |
| cst_errmsg("failed to open file \"%s\" for ssml reading\n", |
| filename); |
| return 1; |
| } |
| fp = get_param_int(voice->features,"file_start_position",0); |
| if (fp > 0) |
| ts_set_stream_pos(ts,fp); |
| |
| /* If its a file to write to, create and save an empty wave file */ |
| /* as we are going to incrementally append to it */ |
| if (!cst_streq(outtype,"play") && |
| !cst_streq(outtype,"none") && |
| !cst_streq(outtype,"stream")) |
| { |
| w = new_wave(); |
| cst_wave_resize(w,0,1); |
| cst_wave_set_sample_rate(w,16000); |
| cst_wave_save_riff(w,outtype); /* an empty wave */ |
| delete_wave(w); |
| } |
| |
| d = flite_ssml_to_speech_ts(ts,voice,outtype); |
| |
| ts_close(ts); |
| |
| return d; |
| |
| } |
| |
| float flite_ssml_text_to_speech(const char *text, |
| cst_voice *voice, |
| const char *outtype) |
| { |
| cst_tokenstream *ts; |
| int fp; |
| cst_wave *w; |
| float d; |
| |
| if ((ts = ts_open_string(text, |
| get_param_string(voice->features,"text_whitespace",NULL), |
| get_param_string(voice->features,"text_singlecharsymbols",NULL), |
| get_param_string(voice->features,"text_prepunctuation",NULL), |
| get_param_string(voice->features,"text_postpunctuation",NULL))) |
| == NULL) |
| { |
| return 1; |
| } |
| fp = get_param_int(voice->features,"file_start_position",0); |
| if (fp > 0) |
| ts_set_stream_pos(ts,fp); |
| |
| /* If its a file to write to, create and save an empty wave file */ |
| /* as we are going to incrementally append to it */ |
| if (!cst_streq(outtype,"play") && |
| !cst_streq(outtype,"none") && |
| !cst_streq(outtype,"stream")) |
| { |
| w = new_wave(); |
| cst_wave_resize(w,0,1); |
| cst_wave_set_sample_rate(w,16000); |
| cst_wave_save_riff(w,outtype); /* an empty wave */ |
| delete_wave(w); |
| } |
| |
| d = flite_ssml_to_speech_ts(ts,voice,outtype); |
| |
| ts_close(ts); |
| |
| return d; |
| |
| } |
| |
| |