blob: 1b17e6eb62e6cba5d2700a9f5063deb0f95351bc [file] [log] [blame]
/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 2001 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author: Alan W Black (awb@cs.cmu.edu) */
/* Date: January 2001 */
/*************************************************************************/
/* */
/* General unit functions (diphones or clunit) */
/* */
/*************************************************************************/
#include "cst_math.h"
#include "cst_hrg.h"
#include "cst_utt_utils.h"
#include "cst_wave.h"
#include "cst_track.h"
#include "cst_units.h"
#include "cst_sigpr.h"
static int nearest_pm(cst_sts_list *sts_list,int start,int end,float u_index);
cst_utterance *join_units(cst_utterance *utt)
{
/* Make a waveform form the units */
const char *join_type;
join_type = get_param_string(utt->features,"join_type", "modified_lpc");
if (cst_streq(join_type,"none"))
return utt;
#if 0
else if (cst_streq(join_type,"windowed_join"))
join_units_windowed(utt);
#endif
else if (cst_streq(join_type,"simple_join"))
join_units_simple(utt);
else if (cst_streq(join_type,"modified_lpc"))
join_units_modified_lpc(utt);
return utt;
}
cst_utterance *join_units_simple(cst_utterance *utt)
{
cst_wave *w = 0;
cst_lpcres *lpcres;
const char *resynth_type;
const cst_val *streaming_info_val;
resynth_type = get_param_string(utt->features,"resynth_type", "fixed");
asis_to_pm(utt);
concat_units(utt);
lpcres = val_lpcres(utt_feat_val(utt,"target_lpcres"));
streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
if (streaming_info_val)
{
lpcres->asi = val_audio_streaming_info(streaming_info_val);
lpcres->asi->utt = utt;
}
if (cst_streq(resynth_type, "fixed"))
w = lpc_resynth_fixedpoint(lpcres);
else
{
cst_errmsg("unknown resynthesis type %s\n", resynth_type);
cst_error(); /* Should not happen */
}
utt_set_wave(utt,w);
return utt;
}
cst_utterance *join_units_modified_lpc(cst_utterance *utt)
{
cst_wave *w = 0;
cst_lpcres *lpcres;
const char *resynth_type;
const cst_val *streaming_info_val;
resynth_type = get_param_string(utt->features,"resynth_type", "float");
f0_targets_to_pm(utt);
concat_units(utt);
lpcres = val_lpcres(utt_feat_val(utt,"target_lpcres"));
streaming_info_val=get_param_val(utt->features,"streaming_info",NULL);
if (streaming_info_val)
{
lpcres->asi = val_audio_streaming_info(streaming_info_val);
lpcres->asi->utt = utt;
}
if (cst_streq(resynth_type, "float"))
w = lpc_resynth(lpcres);
else if (cst_streq(resynth_type, "fixed"))
{
w = lpc_resynth_fixedpoint(lpcres);
}
else
{
cst_errmsg("unknown resynthesis type %s\n", resynth_type);
cst_error(); /* Should not happen */
}
if (w == NULL)
{
/* Synthesis Failed, probably because it was interrupted */
utt_set_feat_int(utt,"Interrupted",1);
w = new_wave();
}
utt_set_wave(utt,w);
return utt;
}
cst_utterance *asis_to_pm(cst_utterance *utt)
{
/* Copy the PM structure from the units unchanged */
cst_item *u;
cst_lpcres *target_lpcres;
int unit_start, unit_end;
int utt_pms, utt_size, i;
cst_sts_list *sts_list;
sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
target_lpcres = new_lpcres();
/* Pass one to find the size */
utt_pms = utt_size = 0;
for (u=relation_head(utt_relation(utt,"Unit"));
u;
u=item_next(u))
{
unit_start = item_feat_int(u,"unit_start");
unit_end = item_feat_int(u,"unit_end");
utt_size += get_unit_size(sts_list,unit_start,unit_end);
utt_pms += unit_end - unit_start;
item_set_int(u,"target_end",utt_size);
}
lpcres_resize_frames(target_lpcres,utt_pms);
/* Pass two to fill in the values */
utt_pms = utt_size = 0;
for (u=relation_head(utt_relation(utt,"Unit"));
u;
u=item_next(u))
{
unit_start = item_feat_int(u,"unit_start");
unit_end = item_feat_int(u,"unit_end");
for (i=unit_start; i<unit_end; i++,utt_pms++)
{
utt_size += get_frame_size(sts_list, i);
target_lpcres->times[utt_pms] = utt_size;
}
}
utt_set_feat(utt,"target_lpcres",lpcres_val(target_lpcres));
return utt;
}
cst_utterance *f0_targets_to_pm(cst_utterance *utt)
{
cst_item *t;
float pos,lpos,f0,lf0,m;
double time;
int pm;
cst_sts_list *sts_list;
cst_lpcres *target_lpcres;
sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
lpos = 0;
lf0 = 120; /* hmm */
pm = 0;
time = 0;
/* First pass to count how many pms will be required */
for (t=relation_head(utt_relation(utt,"Target"));
t;
t=item_next(t), lf0 = f0, lpos = pos) /* changed by dhopkins */
{
pos = item_feat_float(t,"pos");
f0 = item_feat_float(t,"f0");
if (time == pos) continue;
m = (f0-lf0)/(pos-lpos);
for ( ; time < pos; pm++)
{
time += 1/(lf0 + ((time-lpos)*m));
}
}
target_lpcres = new_lpcres();
lpcres_resize_frames(target_lpcres,pm);
lpos = 0;
lf0 = 120;
pm = 0;
time = 0;
/* Second pass puts the values in */
for (t=relation_head(utt_relation(utt,"Target"));
t;
t=item_next(t), lf0 = f0, lpos = pos) /* changed by dhopkins */
{
pos = item_feat_float(t,"pos");
f0 = item_feat_float(t,"f0");
if (time == pos) continue;
m = (f0-lf0)/(pos-lpos);
for ( ; time < pos; pm++)
{
time += 1/(lf0 + ((time-lpos)*m));
target_lpcres->times[pm] = sts_list->sample_rate * time;
}
}
utt_set_feat(utt,"target_lpcres",lpcres_val(target_lpcres));
return utt;
}
cst_utterance *concat_units(cst_utterance *utt)
{
cst_lpcres *target_lpcres;
cst_item *u;
int pm_i;
int unit_size, unit_start, unit_end;
int rpos, nearest_u_pm;
int target_end, target_start;
float m, u_index;
cst_sts_list *sts_list;
const char *residual_type;
sts_list = val_sts_list(utt_feat_val(utt,"sts_list"));
if (sts_list->codec == NULL)
residual_type = "ulaw";
else
residual_type = sts_list->codec;
target_lpcres = val_lpcres(utt_feat_val(utt,"target_lpcres"));
target_lpcres->lpc_min = sts_list->coeff_min;
target_lpcres->lpc_range = sts_list->coeff_range;
target_lpcres->num_channels = sts_list->num_channels;
target_lpcres->sample_rate = sts_list->sample_rate;
lpcres_resize_samples(target_lpcres,
target_lpcres->times[target_lpcres->num_frames-1]);
if (utt_feat_val(utt,"delayed_decoding"))
{
target_lpcres->delayed_decoding = 1;
target_lpcres->packed_residuals =
cst_alloc(const unsigned char *,target_lpcres->num_frames);
}
target_start = 0.0; rpos = 0; pm_i = 0; u_index = 0;
for (u=relation_head(utt_relation(utt,"Unit")); u; u=item_next(u))
{
unit_start = item_feat_int(u,"unit_start");
unit_end = item_feat_int(u,"unit_end");
unit_size = get_unit_size(sts_list,unit_start,unit_end);
target_end = item_feat_int(u,"target_end");
u_index = 0;
m = (float)unit_size/(float)(target_end-target_start);
/* printf("unit_size %d start %d end %d tstart %d tend %d m %f\n",
unit_size, unit_start, unit_end, target_start, target_end, m); */
for ( /* pm_start=pm_i */ ;
(pm_i < target_lpcres->num_frames) &&
(target_lpcres->times[pm_i] <= target_end);
pm_i++)
{
nearest_u_pm = nearest_pm(sts_list,unit_start,unit_end,u_index);
/* Get LPC coefs (pointer) */
target_lpcres->frames[pm_i] = get_sts_frame(sts_list, nearest_u_pm);
/* Get residual (copy) */
target_lpcres->sizes[pm_i] =
target_lpcres->times[pm_i] -
(pm_i > 0 ? target_lpcres->times[pm_i-1] : 0);
if (cst_streq(residual_type,"pulse"))
add_residual_pulse(target_lpcres->sizes[pm_i],
&target_lpcres->residual[rpos],
get_frame_size(sts_list, nearest_u_pm),
get_sts_residual(sts_list, nearest_u_pm));
else if (cst_streq(residual_type,"g721"))
add_residual_g721(target_lpcres->sizes[pm_i],
&target_lpcres->residual[rpos],
get_frame_size(sts_list, nearest_u_pm),
get_sts_residual(sts_list, nearest_u_pm));
else if (cst_streq(residual_type,"g721vuv"))
{
if (target_lpcres->delayed_decoding)
{
target_lpcres->packed_residuals[pm_i] =
get_sts_residual(sts_list, nearest_u_pm);
}
else
{
add_residual_g721vuv(target_lpcres->sizes[pm_i],
&target_lpcres->residual[rpos],
get_frame_size(sts_list, nearest_u_pm),
get_sts_residual(sts_list, nearest_u_pm));
}
}
else if (cst_streq(residual_type,"vuv"))
add_residual_vuv(target_lpcres->sizes[pm_i],
&target_lpcres->residual[rpos],
get_frame_size(sts_list, nearest_u_pm),
get_sts_residual(sts_list, nearest_u_pm));
/* But this requires particular layout of residuals which
probably isn't true */
/*
if (cst_streq(residual_type,"windowed"))
add_residual_windowed(target_lpcres->sizes[pm_i],
&target_lpcres->residual[rpos],
get_frame_size(sts_list, nearest_u_pm),
get_sts_residual(sts_list, nearest_u_pm));
*/
else /* default is "ulaw" */
add_residual(target_lpcres->sizes[pm_i],
&target_lpcres->residual[rpos],
get_frame_size(sts_list, nearest_u_pm),
get_sts_residual(sts_list, nearest_u_pm));
rpos+=target_lpcres->sizes[pm_i];
u_index += (float)target_lpcres->sizes[pm_i]*m;
}
target_start = target_end;
}
target_lpcres->num_frames = pm_i;
return utt;
}
static int nearest_pm(cst_sts_list *sts_list, int start,int end,float u_index)
{
/* First the pm in unit_entry that is closest to u_index */
int i, i_size, n_size;
i_size = 0;
for (i=start; i < end; i++)
{
n_size = i_size + get_frame_size(sts_list, i);
if (fabs((double)(u_index-(float)i_size)) <
fabs((double)(u_index-(float)n_size)))
return i;
i_size = n_size;
}
return end-1;
}
#if 0
void add_residual_windowed(int targ_size,
unsigned char *targ_residual,
int unit_size,
const unsigned char *unit_residual)
{
/* Note this doesn't work unless the unit_residuals and consecutive */
#define DI_PI 3.14159265358979323846
float *window, *unit, *residual;
int i,j,k, offset, win_size;
win_size = (targ_size*2)+1;
window = cst_alloc(float,win_size);
window[targ_size+1] = 1.0;
k = DI_PI / (win_size - 1);
for (i=0,j=win_size-1; i < targ_size+1; i++,j--)
window[j] = window[i] = 0.54 - (0.46 * cos(k * i));
residual = cst_alloc(float,win_size);
for (i=0; i<win_size; i++)
residual[i] = cst_ulaw_to_short(targ_residual[i]);
unit = cst_alloc(float,(unit_size*2)+1);
for (i=0; i<(unit_size*2)+1; i++)
unit[i] = cst_ulaw_to_short(unit_residual[i]);
if (targ_size < unit_size)
for (i=0; i < win_size; i++)
residual[i] += window[i] * unit[i+(unit_size-targ_size)/2];
else
{
offset = (targ_size-unit_size)/2;
for (i=offset; i < win_size-offset; i++)
residual[i] += window[i] * unit[i-offset];
}
for (i=0; i < win_size; i++)
targ_residual[i] = cst_short_to_ulaw((short)residual[i]);
cst_free(window);
cst_free(residual);
cst_free(unit);
}
#endif
void add_residual(int targ_size, unsigned char *targ_residual,
int unit_size, const unsigned char *unit_residual)
{
/* float pow_factor;
int i; */
if (unit_size < targ_size)
memmove(&targ_residual[((targ_size-unit_size)/2)],
&unit_residual[0],
unit_size*sizeof(unsigned char));
else
{
memmove(&targ_residual[0],
&unit_residual[((unit_size-targ_size)/2)],
targ_size*sizeof(unsigned char));
}
#if 0
if (unit_size < targ_size)
memmove(&targ_residual[0],
&unit_residual[0],
unit_size*sizeof(unsigned char));
else
{
memmove(&targ_residual[0],
&unit_residual[0],
targ_size*sizeof(unsigned char));
}
#endif
}
void add_residual_g721(int targ_size, unsigned char *targ_residual,
int uunit_size, const unsigned char *unit_residual)
{
/* Residual is encoded with g721 */
unsigned char *unit_residual_unpacked;
int unit_size;
unit_residual_unpacked =
cst_g721_decode(&unit_size, (uunit_size+CST_G721_LEADIN+1)/2, unit_residual);
if (uunit_size < targ_size)
memmove(&targ_residual[((targ_size-uunit_size)/2)],
&unit_residual_unpacked[CST_G721_LEADIN],
uunit_size*sizeof(unsigned char));
else
{
memmove(&targ_residual[0],
&unit_residual_unpacked[CST_G721_LEADIN+((uunit_size-targ_size)/2)],
targ_size*sizeof(unsigned char));
}
cst_free(unit_residual_unpacked);
}
static double plus_or_minus_one()
{
/* Randomly return 1 or -1 */
/* not sure rand() is portable */
if (rand() > RAND_MAX/2.0)
return 1.0;
else
return -1.0;
}
static double rand_zero_to_one()
{
/* Return number between 0.0 and 1.0 */
return rand()/(float)RAND_MAX;
}
void add_residual_g721vuv(int targ_size, unsigned char *targ_residual,
int uunit_size, const unsigned char *unit_residual)
{
/* Residual is encoded with g721 */
unsigned char *unit_residual_unpacked;
int p, j;
float m, q;
int unit_size;
int offset;
if (unit_residual[0] == 0)
{
unit_size = uunit_size;
unit_residual_unpacked = cst_alloc(unsigned char,unit_size);
p = unit_residual[4]; p = p << 8;
p += unit_residual[3]; p = p << 8;
p += unit_residual[2]; p = p << 8;
p += unit_residual[1];
m = ((float)p);
for (j=0; j<unit_size; j++)
{
q = m*2*rand_zero_to_one()*plus_or_minus_one();
unit_residual_unpacked[j] = cst_short_to_ulaw((short)q);
}
offset = 0;
}
else
{
unit_residual_unpacked =
cst_g721_decode(&unit_size, (uunit_size+CST_G721_LEADIN+1)/2, unit_residual);
offset = CST_G721_LEADIN;
}
if (uunit_size < targ_size)
memmove(&targ_residual[((targ_size-uunit_size)/2)],
&unit_residual_unpacked[offset],
uunit_size*sizeof(unsigned char));
else
{
memmove(&targ_residual[0],
&unit_residual_unpacked[offset+((uunit_size-targ_size)/2)],
targ_size*sizeof(unsigned char));
}
cst_free(unit_residual_unpacked);
}
void add_residual_vuv(int targ_size, unsigned char *targ_residual,
int uunit_size, const unsigned char *unit_residual)
{
/* Residual is encoded with vuv */
unsigned char *unit_residual_unpacked;
int p, j;
float m, q;
int unit_size;
if (unit_residual[0] == 0)
{
unit_size = uunit_size;
unit_residual_unpacked = cst_alloc(unsigned char,unit_size);
p = unit_residual[4]; p = p << 8;
p += unit_residual[3]; p = p << 8;
p += unit_residual[2]; p = p << 8;
p += unit_residual[1];
m = ((float)p);
for (j=0; j<unit_size; j++)
{
q = m*2*rand_zero_to_one()*plus_or_minus_one();
unit_residual_unpacked[j] = cst_short_to_ulaw((short)q);
}
}
else
{
/* Put in to the unpacked -- with no unpacking */
/* The cast is because unit_residual is const, and can't be deleted */
unit_residual_unpacked = (unsigned char *)(void *)unit_residual;
}
if (uunit_size < targ_size)
memmove(&targ_residual[((targ_size-uunit_size)/2)],
&unit_residual_unpacked[0],
uunit_size*sizeof(unsigned char));
else
{
memmove(&targ_residual[0],
&unit_residual_unpacked[((uunit_size-targ_size)/2)],
targ_size*sizeof(unsigned char));
}
if (unit_residual[0] == 0)
cst_free(unit_residual_unpacked);
}
void add_residual_pulse(int targ_size, unsigned char *targ_residual,
int unit_size, const unsigned char *unit_residual)
{
int p,i,m;
/* Unit residual isn't a pointer its a number, the power for the
the sts, yes this is hackily casting the address to a number */
/* Need voiced and unvoiced model */
p = (int)unit_residual; /* I know the compiler will complain about this */
if (p > 7000) /* voiced */
{
i = ((targ_size-unit_size)/2);
targ_residual[i-2] = cst_short_to_ulaw((short)(p/4));
targ_residual[i] = cst_short_to_ulaw((short)(p/2));
targ_residual[i+2] = cst_short_to_ulaw((short)(p/4));
}
else /* unvoiced */
{
m = p / targ_size;
for (i=0; i<targ_size; i++)
targ_residual[i] =
cst_short_to_ulaw((short)(m*plus_or_minus_one()));
}
#if 0
if (unit_size < targ_size)
targ_residual[((targ_size-unit_size)/2)]
= cst_short_to_ulaw((short)(int)unit_residual);
else
targ_residual[((unit_size-targ_size)/2)]
= cst_short_to_ulaw((short)(int)unit_residual);
#endif
}