blob: 866a69c6b41a8c8102191835378d8abdf2d8c8cb [file] [log] [blame]
/*
** Copyright 2003-2010, VisualOn, Inc.
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
/***********************************************************************
* File: wb_vad.c *
* *
* Description: Voice Activity Detection *
* *
************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "cnst.h"
#include "wb_vad.h"
#include "typedef.h"
#include "basic_op.h"
#include "math_op.h"
#include "wb_vad_c.h"
#include "mem_align.h"
/******************************************************************************
* Calculate Log2 and scale the signal:
*
* ilog2(Word32 in) = -1024*log10(in * 2^-31)/log10(2), where in = [1, 2^31-1]
*
* input output
* 32768 16384
* 1 31744
*
* When input is in the range of [1,2^16], max error is 0.0380%.
*********************************************************************************/
static Word16 ilog2(                       /* return: output value of the log2 */
        Word16 mant                        /* i: value to be converted */
        )
{
    Word16 shift_in, shift_sq, result;
    Word32 pass, l_square;

    /* Non-positive input is replaced by the smallest positive value. */
    if (mant <= 0)
    {
        mant = 1;
    }

    /* Normalise the input; the shift count feeds the coarse part of the result. */
    shift_in = norm_s(mant);
    mant = mant << shift_in;

    /* Square the normalised value three times in 16 bits ... */
    for (pass = 0; pass < 3; pass++)
    {
        mant = vo_mult(mant, mant);
    }
    /* ... and once more in 32 bits. */
    l_square = vo_L_mult(mant, mant);

    /* Re-normalise the repeated square; its shift count refines the result. */
    shift_sq = norm_l(l_square);
    mant = extract_h(l_square << shift_sq);

    /* Combine coarse part, refinement and the remaining mantissa bits. */
    result = (shift_in + 16) << 10;
    result = add1(result, (shift_sq << 6));
    result = vo_sub(add1(result, 127), (mant >> 8));

    return (result);
}
/******************************************************************************
*
* Function : filter5
* Purpose : Fifth-order half-band lowpass/highpass filter pair with
* decimation.
*
*******************************************************************************/
static void filter5(
        Word16 * in0,                      /* i/o : input values; output low-pass part  */
        Word16 * in1,                      /* i/o : input values; output high-pass part */
        Word16 data[]                      /* i/o : filter memory (two entries)         */
        )
{
    Word16 state, branch0, branch1;

    /* First recursive section: driven by *in0, memory in data[0]. */
    state = vo_sub(*in0, vo_mult(COEFF5_1, data[0]));
    branch0 = add1(data[0], vo_mult(COEFF5_1, state));
    data[0] = state;

    /* Second recursive section: driven by *in1, memory in data[1]. */
    state = vo_sub(*in1, vo_mult(COEFF5_2, data[1]));
    branch1 = add1(data[1], vo_mult(COEFF5_2, state));
    data[1] = state;

    /* Sum of the branches goes to *in0, difference to *in1. */
    *in0 = extract_h((vo_L_add(branch0, branch1) << 15));
    *in1 = extract_h((vo_L_sub(branch0, branch1) << 15));
}
/******************************************************************************
*
* Function : filter3
* Purpose : Third-order half-band lowpass/highpass filter pair with
* decimation.
*
*******************************************************************************/
static void filter3(
        Word16 * in0,                      /* i/o : input values; output low-pass part  */
        Word16 * in1,                      /* i/o : input values; output high-pass part */
        Word16 * data                      /* i/o : filter memory (single entry)        */
        )
{
    Word16 state, branch;

    /* Recursive section driven by *in1, memory in *data. */
    state = vo_sub(*in1, vo_mult(COEFF3, *data));
    branch = add1(*data, vo_mult(COEFF3, state));
    *data = state;

    /* *in1 must be written first: both outputs are formed from the old *in0. */
    *in1 = extract_h((vo_L_sub(*in0, branch) << 15));
    *in0 = extract_h((vo_L_add(*in0, branch) << 15));
}
/******************************************************************************
*
* Function : level_calculation
* Purpose : Calculate signal level in a sub-band. Level is calculated
* by summing absolute values of the input data.
*
* Signal level calculated from the end of the frame
* (data[count1 - count2]) is stored to (*sub_level)
* and added to the level of the next frame.
*
******************************************************************************/
static Word16 level_calculation(           /* return: signal level */
        Word16 data[],                     /* i : signal buffer */
        Word16 * sub_level,                /* i : level calculated at the end of the previous frame */
                                           /* o : level of the samples with index count1..count2-1  */
        Word16 count1,                     /* i : first index belonging to the end-of-frame part    */
        Word16 count2,                     /* i : one past the last index to be counted             */
        Word16 ind_m,                      /* i : step size for the index of the data buffer        */
        Word16 ind_a,                      /* i : starting index of the data buffer                 */
        Word16 scale                       /* i : scaling for the level calculation                 */
        )
{
    Word32 idx, l_tail, l_total;

    /* Sum of doubled magnitudes over the end-of-frame part [count1, count2). */
    l_tail = 0L;
    for (idx = count1; idx < count2; idx++)
    {
        l_tail += (abs_s(data[ind_m * idx + ind_a]) << 1);
    }

    /* Start the frame total from this tail plus the tail saved by the previous call,
     * then save the new tail (scaled) for the next call. */
    l_total = vo_L_add(l_tail, L_shl(*sub_level, 16 - scale));
    *sub_level = extract_h(L_shl(l_tail, scale));

    /* Add the doubled magnitudes of the leading part [0, count1). */
    for (idx = 0; idx < count1; idx++)
    {
        l_total += (abs_s(data[ind_m * idx + ind_a]) << 1);
    }

    return extract_h(L_shl2(l_total, scale));
}
/******************************************************************************
*
* Function : filter_bank
* Purpose : Divide input signal into bands and calculate level of
* the signal in each band
*
*******************************************************************************/
static void filter_bank(
        VadVars * st,                      /* i/o : State struct                 */
        Word16 in[],                       /* i   : input frame                  */
        Word16 level[]                     /* o   : signal levels at each band   */
        )
{
    /* Arguments handed to level_calculation() for each band, indexed by band number. */
    static const struct
    {
        Word16 count1;
        Word16 count2;
        Word16 ind_m;
        Word16 ind_a;
        Word16 scale;
    } band_cfg[] =
    {
        {  2,  8, 32,  0, 17 },            /*    0 -  200 Hz */
        {  2,  8, 32, 16, 17 },            /*  200 -  400 Hz */
        {  2,  8, 32, 24, 17 },            /*  400 -  600 Hz */
        {  2,  8, 32,  8, 17 },            /*  600 -  800 Hz */
        {  4, 16, 16, 12, 16 },            /*  800 - 1200 Hz */
        {  4, 16, 16,  4, 16 },            /* 1200 - 1600 Hz */
        {  4, 16, 16,  6, 16 },            /* 1600 - 2000 Hz */
        {  4, 16, 16, 14, 16 },            /* 2000 - 2400 Hz */
        {  8, 32,  8,  2, 15 },            /* 2400 - 3200 Hz */
        {  8, 32,  8,  3, 15 },            /* 3200 - 4000 Hz */
        {  8, 32,  8,  7, 15 },            /* 4000 - 4800 Hz */
        { 16, 64,  4,  1, 14 }             /* 4800 - 6400 Hz */
    };
    Word32 n, band;
    Word16 work[FRAME_LEN];
    Word16 *p;

    /* shift input 1 bit down for safe scaling */
    for (n = 0; n < FRAME_LEN; n++)
    {
        work[n] = in[n] >> 1;
    }

    /* run the filter bank: each stage works in place on the interleaved buffer */
    for (n = 0; n < 128; n++)
    {
        p = &work[2 * n];
        filter5(&p[0], &p[1], st->a_data5[0]);
    }
    for (n = 0; n < 64; n++)
    {
        p = &work[4 * n];
        filter5(&p[0], &p[2], st->a_data5[1]);
        filter5(&p[1], &p[3], st->a_data5[2]);
    }
    for (n = 0; n < 32; n++)
    {
        p = &work[8 * n];
        filter5(&p[0], &p[4], st->a_data5[3]);
        filter5(&p[2], &p[6], st->a_data5[4]);
        filter3(&p[3], &p[7], &st->a_data3[0]);
    }
    for (n = 0; n < 16; n++)
    {
        p = &work[16 * n];
        filter3(&p[0], &p[8], &st->a_data3[1]);
        filter3(&p[4], &p[12], &st->a_data3[2]);
        filter3(&p[6], &p[14], &st->a_data3[3]);
    }
    for (n = 0; n < 8; n++)
    {
        p = &work[32 * n];
        filter3(&p[0], &p[16], &st->a_data3[4]);
        filter3(&p[8], &p[24], &st->a_data3[5]);
    }

    /* calculate levels in each frequency band; every call touches only its own
     * sub_level entry, so the bands can be processed in any order */
    for (band = 0; band < (Word32) (sizeof(band_cfg) / sizeof(band_cfg[0])); band++)
    {
        level[band] = level_calculation(work, &st->sub_level[band],
                band_cfg[band].count1, band_cfg[band].count2,
                band_cfg[band].ind_m, band_cfg[band].ind_a,
                band_cfg[band].scale);
    }
}
/******************************************************************************
*
* Function : update_cntrl
* Purpose : Control update of the background noise estimate.
*
*******************************************************************************/
static void update_cntrl(
        VadVars * st,                      /* i/o : State structure                    */
        Word16 level[]                     /* i   : sub-band levels of the input frame */
        )
{
    Word32 band;
    Word16 num, denom, quot, shift, stat_rat;
    Word16 alpha;

    if (sub((Word16) (st->tone_flag & 0x7c00), 0x7c00) == 0)
    {
        /* all tone flags selected by mask 0x7c00 are set: initialize stat_count */
        st->stat_count = STAT_COUNT;
    }
    else if ((st->vadreg & 0x7f80) == 0)
    {
        /* the 8 last vad-decisions have been "0": reinitialize stat_count */
        st->stat_count = STAT_COUNT;
    }
    else
    {
        /* accumulate, over all bands, the ratio between the larger and the
         * smaller of the current level and its running average */
        stat_rat = 0;
        for (band = 0; band < COMPLEN; band++)
        {
            if (level[band] > st->ave_level[band])
            {
                num = level[band];
                denom = st->ave_level[band];
            }
            else
            {
                num = st->ave_level[band];
                denom = level[band];
            }

            /* Limit minimum value of num and denom to STAT_THR_LEVEL */
            if (num < STAT_THR_LEVEL)
            {
                num = STAT_THR_LEVEL;
            }
            if (denom < STAT_THR_LEVEL)
            {
                denom = STAT_THR_LEVEL;
            }

            shift = norm_s(denom);
            denom = denom << shift;

            /* stat_rat = num/denom * 64 */
            quot = div_s(num >> 1, denom);
            stat_rat = add1(stat_rat, shr(quot, (8 - shift)));
        }

        /* compare stat_rat with a threshold and update stat_count */
        if (stat_rat > STAT_THR)
        {
            st->stat_count = STAT_COUNT;
        }
        else if (((st->vadreg & 0x4000) != 0) && (st->stat_count != 0))
        {
            /* count down only while the newest VAD flag is set */
            st->stat_count = st->stat_count - 1;
        }
    }

    /* Update average amplitude estimate for stationarity estimation */
    if (st->stat_count == STAT_COUNT)
    {
        alpha = 32767;
    }
    else if ((st->vadreg & 0x4000) == 0)
    {
        alpha = ALPHA5;
    }
    else
    {
        alpha = ALPHA4;
    }

    for (band = 0; band < COMPLEN; band++)
    {
        st->ave_level[band] = add1(st->ave_level[band], vo_mult_r(alpha, vo_sub(level[band], st->ave_level[band])));
    }
}
/******************************************************************************
*
* Function : hangover_addition
* Purpose : Add hangover after speech bursts
*
*******************************************************************************/
static Word16 hangover_addition(           /* return: VAD_flag indicating final VAD decision */
        VadVars * st,                      /* i/o : State structure                          */
        Word16 low_power,                  /* i   : flag power of the input frame            */
        Word16 hang_len,                   /* i   : hangover length                          */
        Word16 burst_len                   /* i   : minimum burst length for hangover addition */
        )
{
    /* Low input power: clear both counters and report "no speech". */
    if (low_power != 0)
    {
        st->burst_count = 0;
        st->hang_count = 0;
        return 0;
    }

    /* Newest intermediate decision is "0": the burst is over; keep reporting
     * speech only while hangover frames remain. */
    if ((st->vadreg & 0x4000) == 0)
    {
        st->burst_count = 0;
        if (st->hang_count <= 0)
        {
            return 0;
        }
        st->hang_count = st->hang_count - 1;
        return 1;
    }

    /* Newest intermediate decision is "1": extend the burst and, once it is
     * long enough, (re)arm the hangover counter. */
    st->burst_count = st->burst_count + 1;
    if (st->burst_count >= burst_len)
    {
        st->hang_count = hang_len;
    }
    return 1;
}
/******************************************************************************
*
* Function : noise_estimate_update
* Purpose : Update of background noise estimate
*
*******************************************************************************/
static void noise_estimate_update(
        VadVars * st,                      /* i/o : State structure                    */
        Word16 level[]                     /* i   : sub-band levels of the input frame */
        )
{
    Word32 band;
    Word16 alpha_up, alpha_down, bckr_add;

    /* Control update of bckr_est[] */
    update_cntrl(st, level);

    /* Choose update speed */
    if ((0x7800 & st->vadreg) == 0)
    {
        alpha_up = ALPHA_UP1;
        alpha_down = ALPHA_DOWN1;
        bckr_add = 2;
    }
    else if (st->stat_count == 0)
    {
        alpha_up = ALPHA_UP2;
        alpha_down = ALPHA_DOWN2;
        bckr_add = 2;
    }
    else
    {
        /* no upward adaptation in this case */
        alpha_up = 0;
        alpha_down = ALPHA3;
        bckr_add = 0;
    }

    /* Update noise estimate (bckr_est) towards the levels of the previous frame */
    for (band = 0; band < COMPLEN; band++)
    {
        Word16 diff;

        diff = (st->old_level[band] - st->bckr_est[band]);

        if (diff >= 0)
        {                                  /* update upwards */
            st->bckr_est[band] = add1(bckr_add, add1(st->bckr_est[band],vo_mult_r(alpha_up, diff)));

            /* limit maximum value of the noise estimate to NOISE_MAX */
            if (st->bckr_est[band] > NOISE_MAX)
            {
                st->bckr_est[band] = NOISE_MAX;
            }
        }
        else
        {                                  /* update downwards */
            /* NOTE(review): the offset here is a fixed -2 (bckr_add is not used)
             * and the inner sum uses add() where the upward branch uses add1();
             * both are kept exactly as found. */
            st->bckr_est[band] = add1(-2, add(st->bckr_est[band],vo_mult_r(alpha_down, diff)));

            /* limit minimum value of the noise estimate to NOISE_MIN */
            if (st->bckr_est[band] < NOISE_MIN)
            {
                st->bckr_est[band] = NOISE_MIN;
            }
        }
    }

    /* Update signal levels of the previous frame (old_level) */
    for (band = 0; band < COMPLEN; band++)
    {
        st->old_level[band] = level[band];
    }
}
/******************************************************************************
*
* Function : vad_decision
* Purpose : Calculates VAD_flag
*
*******************************************************************************/
static Word16 vad_decision(                /* return value : VAD_flag                  */
        VadVars * st,                      /* i/o : State structure                    */
        Word16 level[COMPLEN],             /* i   : sub-band levels of the input frame */
        Word32 pow_sum                     /* i   : power of the input frame           */
        )
{
    Word32 band;
    Word32 L_snr_sum;
    Word32 L_noise_sum;
    Word16 vad_thr, norm_noise, snr, noise_level, min_speech;
    Word16 noise_part, speech_part;
    Word16 low_power_flag;
    Word16 hang_len, burst_len;
    Word16 ilog2_speech_level, ilog2_noise_level;

    /* Calculate squared sum of the input levels (level) divided by the
     * background noise components (bckr_est). */
    L_snr_sum = 0;
    for (band = 0; band < COMPLEN; band++)
    {
        Word16 shift;

        shift = norm_s(st->bckr_est[band]);
        norm_noise = (st->bckr_est[band] << shift);
        snr = div_s((level[band] >> 1), norm_noise);
        snr = shl(snr, (shift - (UNIRSHFT - 1)));
        L_snr_sum = L_mac(L_snr_sum, snr, snr);
    }

    /* Calculate average level of estimated background noise */
    L_noise_sum = 0;
    for (band = 1; band < COMPLEN; band++) /* ignore lowest band */
    {
        L_noise_sum = vo_L_add(L_noise_sum, st->bckr_est[band]);
    }
    noise_level = extract_h((L_noise_sum << 12));

    /* Keep speech_level at or above the noise level scaled by MIN_SPEECH_SNR */
    min_speech = vo_mult(noise_level, MIN_SPEECH_SNR) << 3;
    if (st->speech_level < min_speech)
    {
        st->speech_level = min_speech;
    }

    ilog2_noise_level = ilog2(noise_level);

    /* If SNR is very poor, speech_level is probably corrupted by noise level.
     * This is corrected by subtracting MIN_SPEECH_SNR*noise_level from speech level */
    ilog2_speech_level = ilog2(st->speech_level - min_speech);

    /* Threshold = noise-dependent part + speech-dependent part (the latter
     * limited to [SP_CH_MIN, SP_CH_MAX]), with a lower bound of THR_MIN. */
    noise_part = add1(vo_mult(NO_SLOPE, (ilog2_noise_level - NO_P1)), THR_HIGH);
    speech_part = add1(SP_CH_MIN, vo_mult(SP_SLOPE, (ilog2_speech_level - SP_P1)));
    if (speech_part < SP_CH_MIN)
    {
        speech_part = SP_CH_MIN;
    }
    if (speech_part > SP_CH_MAX)
    {
        speech_part = SP_CH_MAX;
    }

    vad_thr = noise_part + speech_part;
    if (vad_thr < THR_MIN)
    {
        vad_thr = THR_MIN;
    }

    /* Shift VAD decision register */
    st->vadreg = (st->vadreg >> 1);

    /* Make intermediate VAD decision */
    if (L_snr_sum > vo_L_mult(vad_thr, (512 * COMPLEN)))
    {
        st->vadreg = (Word16) (st->vadreg | 0x4000);
    }

    /* check if the input power (pow_sum) is lower than a threshold */
    low_power_flag = (pow_sum < VAD_POW_LOW) ? 1 : 0;

    /* Update background noise estimates */
    noise_estimate_update(st, level);

    /* Calculate values for hang_len and burst_len based on vad_thr */
    hang_len = add1(vo_mult(HANG_SLOPE, (vad_thr - HANG_P1)), HANG_HIGH);
    if (hang_len < HANG_LOW)
    {
        hang_len = HANG_LOW;
    }
    burst_len = add1(vo_mult(BURST_SLOPE, (vad_thr - BURST_P1)), BURST_HIGH);

    return (hangover_addition(st, low_power_flag, hang_len, burst_len));
}
/******************************************************************************
*
* Function : Estimate_Speech()
* Purpose : Estimate speech level
*
* Maximum signal level is searched and stored to the variable sp_max.
* The speech frames must locate within SP_EST_COUNT number of frames.
* Thus, noisy frames having occasional VAD = "1" decisions will not
* affect to the estimated speech_level.
*
*******************************************************************************/
static void Estimate_Speech(
        VadVars * st,                      /* i/o : State structure    */
        Word16 in_level                    /* i   : level of the input frame */
        )
{
    Word16 alpha;
    Word16 avg_level;

    /* if the required activity count cannot be achieved, reset counters */
    if ((st->sp_est_cnt - st->sp_max_cnt) > (SP_EST_COUNT - SP_ACTIVITY_COUNT))
    {
        st->sp_est_cnt = 0;
        st->sp_max = 0;
        st->sp_max_cnt = 0;
    }
    st->sp_est_cnt += 1;

    /* The frame only counts as active when it is louder than MIN_SPEECH_LEVEL1 ... */
    if (in_level <= MIN_SPEECH_LEVEL1)
    {
        return;
    }
    /* ... and either the newest VAD flag is set or it exceeds the current estimate. */
    if (((st->vadreg & 0x4000) == 0) && (in_level <= st->speech_level))
    {
        return;
    }

    /* update sp_max */
    if (in_level > st->sp_max)
    {
        st->sp_max = in_level;
    }
    st->sp_max_cnt += 1;

    /* wait until enough active frames have been collected */
    if (st->sp_max_cnt < SP_ACTIVITY_COUNT)
    {
        return;
    }

    /* update speech estimate */
    avg_level = (st->sp_max >> 1);         /* scale to get "average" speech level */

    /* select update speed */
    alpha = (avg_level > st->speech_level) ? ALPHA_SP_UP : ALPHA_SP_DOWN;

    if (avg_level > MIN_SPEECH_LEVEL2)
    {
        st->speech_level = add1(st->speech_level, vo_mult_r(alpha, vo_sub(avg_level, st->speech_level)));
    }

    /* clear all counters used for speech estimation */
    st->sp_max = 0;
    st->sp_max_cnt = 0;
    st->sp_est_cnt = 0;
}
/******************************************************************************
*
* Function: wb_vad_init
* Purpose: Allocates state memory and initializes state memory
*
*******************************************************************************/
Word16 wb_vad_init(                        /* return: non-zero with error, zero for ok. */
        VadVars ** state,                  /* i/o : receives the newly allocated state  */
        VO_MEM_OPERATOR *pMemOP            /* i   : memory operator handed to mem_malloc */
        )
{
    VadVars *s;

    /* The caller must supply somewhere to store the new state pointer. */
    if (state == (VadVars **) NULL)
    {
        fprintf(stderr, "vad_init: invalid parameter\n");
        return -1;
    }

    /* Leave *state NULL unless everything below succeeds. */
    *state = NULL;

    /* allocate memory */
    s = (VadVars *) mem_malloc(pMemOP, sizeof(VadVars), 32, VO_INDEX_ENC_AMRWB);
    if (s == NULL)
    {
        fprintf(stderr, "vad_init: can not malloc state structure\n");
        return -1;
    }

    /* bring the fresh state to its initial values and publish it */
    wb_vad_reset(s);
    *state = s;

    return 0;
}
/******************************************************************************
*
* Function: wb_vad_reset
* Purpose: Initializes state memory
*
*******************************************************************************/
Word16 wb_vad_reset(                       /* return: non-zero with error, zero for ok. */
        VadVars * state                    /* i/o : State structure */
        )
{
    Word32 i, j;

    if (state == (VadVars *) NULL)
    {
        fprintf(stderr, "vad_reset: invalid parameter\n");
        return -1;
    }

    state->tone_flag = 0;
    state->vadreg = 0;
    state->hang_count = 0;
    state->burst_count = 0;
    /* stat_count is tested and decremented by update_cntrl() and tested by
     * noise_estimate_update() before it is necessarily written, so it needs a
     * defined starting value. Previously hang_count was assigned a second
     * time here and stat_count was left uninitialized. */
    state->stat_count = 0;

    /* initialize memory used by the filter bank */
    for (i = 0; i < F_5TH_CNT; i++)
    {
        for (j = 0; j < 2; j++)
        {
            state->a_data5[i][j] = 0;
        }
    }
    for (i = 0; i < F_3TH_CNT; i++)
    {
        state->a_data3[i] = 0;
    }

    /* initialize the per-band estimates and the saved end-of-frame levels */
    for (i = 0; i < COMPLEN; i++)
    {
        state->bckr_est[i] = NOISE_INIT;
        state->old_level[i] = NOISE_INIT;
        state->ave_level[i] = NOISE_INIT;
        state->sub_level[i] = 0;
    }

    /* initialize speech level estimation and the saved frame power */
    state->sp_est_cnt = 0;
    state->sp_max = 0;
    state->sp_max_cnt = 0;
    state->speech_level = SPEECH_LEVEL_INIT;
    state->prev_pow_sum = 0;

    return 0;
}
/******************************************************************************
*
* Function: wb_vad_exit
* Purpose: The memory used for state memory is freed
*
*******************************************************************************/
void wb_vad_exit(
        VadVars ** state,                  /* i/o : State structure; *state is set to NULL */
        VO_MEM_OPERATOR *pMemOP            /* i   : memory operator handed to mem_free     */
        )
{
    /* Only act when there is a state to release. */
    if (state != NULL && *state != NULL)
    {
        /* deallocate memory and clear the caller's pointer */
        mem_free(pMemOP, *state, VO_INDEX_ENC_AMRWB);
        *state = NULL;
    }
}
/******************************************************************************
*
* Function : wb_vad_tone_detection
* Purpose : Search maximum pitch gain from a frame. Set tone flag if
* pitch gain is high. This is used to detect
* signaling tones and other signals with high pitch gain.
*
*******************************************************************************/
void wb_vad_tone_detection(
        VadVars * st,                      /* i/o : State struct */
        Word16 p_gain                      /* i   : pitch gain   */
        )
{
    /* age the flag history by one frame */
    st->tone_flag >>= 1;

    /* record a new flag in bit 0x4000 when the pitch gain exceeds TONE_THR */
    if (p_gain > TONE_THR)
    {
        st->tone_flag = (Word16) (st->tone_flag | 0x4000);
    }
}
/******************************************************************************
*
* Function : wb_vad
* Purpose : Main program for Voice Activity Detection (VAD) for AMR
*
*******************************************************************************/
Word16 wb_vad(                             /* Return value : VAD Decision, 1 = speech, 0 = noise */
        VadVars * st,                      /* i/o : State structure                 */
        Word16 in_buf[]                    /* i   : samples of the input frame      */
        )
{
    Word16 level[COMPLEN];
    Word32 n;
    Word16 VAD_flag, in_level;
    Word32 L_frame_pow, L_level_sum, pow_sum;

    /* Calculate power of the input frame. */
    L_frame_pow = 0L;
    for (n = 0; n < FRAME_LEN; n++)
    {
        L_frame_pow = L_mac(L_frame_pow, in_buf[n], in_buf[n]);
    }

    /* pow_sum = power of current frame and previous frame */
    pow_sum = L_add(L_frame_pow, st->prev_pow_sum);

    /* save power of current frame for next call */
    st->prev_pow_sum = L_frame_pow;

    /* If input power is very low, clear the tone flags outside mask 0x1fff */
    if (pow_sum < POW_TONE_THR)
    {
        st->tone_flag = (Word16) (st->tone_flag & 0x1fff);
    }

    /* Run the filter bank and calculate signal levels at each band */
    filter_bank(st, in_buf, level);

    /* compute VAD decision */
    VAD_flag = vad_decision(st, level, pow_sum);

    /* Calculate input level */
    L_level_sum = 0;
    for (n = 1; n < COMPLEN; n++)          /* ignore lowest band */
    {
        L_level_sum = vo_L_add(L_level_sum, level[n]);
    }
    in_level = extract_h(L_level_sum << 12);

    /* Estimate speech level */
    Estimate_Speech(st, in_level);

    return (VAD_flag);
}