/*
* vad.cpp
*
* This is a simple VAD (voice activity detection) implementation. It isn't
* tuned and testing has been somewhat limited... but it seems to work ok.
* All numbers that are dB are multiplied by 100 to keep them slightly more
* accurate and easy to use (so -7450 is -74.50 dB). This code uses
* three functions not provided as part of this file: log10_32, bqInit
* and bqProcess. The log function calculates:
*     100 * 10 * log(x) - 9333
* which should be an approximation of the signal energy x 100.
* bqInit initializes a biquad data structure and bqProcess applies
* a biquad to a signal buffer. A biquad is used by the VAD to filter
* out lower frequency background noise.
*
* This code operates under the assumption that it will get frames with
* lengths which are a multiple of 5 ms. This is relatively easy to
* change since the only real dependency is the hangover count -- it's in
* frames now but could easily be changed to samples.
*
*/
#define VAD_CPP
#include "c_utils.h"
#include "c_vad.h"
/* Linkage marker for file-private definitions: expands to 'static'. */
#define LOCAL static
/* Size, in samples, of one processing chunk and of the scratch buffer in
 * vadSubProcess(): 5 ms at the 8 samples/ms rate assumed throughout this
 * file. */
#define DATA_FRAME_LENGTH (5*8)
/*
 * Reload values, in samples, for the noise-floor and signal-max countdowns:
 *   1.5 sec * 1000 ms/sec * 8 samples/ms = 12000 samples
 * computeNFE() raises the noise floor by 1 dB each time the noise-floor
 * countdown runs out.
 */
#define VAD_NOISEFLOOR_CNT_INIT (int)(8*1500)
#define VAD_SIGNALMAX_CNT_INIT (int)(8*1500)
/* Power thresholds and initial levels, in dB */
#define VAD_NOISE_TH_BASE (float) 10.00 /* 10.00 dB initial speech threshold, added to the noise floor */
#define VAD_NOISE_FLOOR_INIT (float)-74.00 /* -74.00 dB initial noise floor */
#define VAD_SIGNAL_MAX_INIT (float)-80.00 /* -80.00 dB initial signal max */
#define VAD_NOISE_TH_MIN (float) 1.00 /* 1.00 dB minimum noise threshold (not referenced in the code visible here) */
/* High-pass biquad coefficients, handed to bqInit() by vadInit(). The filter
 * is applied to the input signal before the energy calculations to get rid
 * of background noise. */
/* Butter (presumably a Butterworth design -- TODO confirm): */
#define vhpfB0 (S2byte) 14339
#define vhpfB1 (S2byte)-28678
#define vhpfB2 (S2byte) 14339
#define vhpfA1 (S2byte)-28422
#define vhpfA2 (S2byte) 12550
/* Number of samples without speech before a silence period is declared:
 *   8 samples/ms * 500 ms = 4000 samples */
#define VAD_HANGOVER_CNT_INIT (int)(8*500)
/*
 * Detector decision state stored in t_vad.state.
 *
 *   VadState_Silence - set by vadSubProcess() once the hangover countdown
 *                      has run out without speech being seen.
 *   VadState_Speech  - set by vadSubProcess() whenever a chunk exceeds the
 *                      speech threshold; it is not cleared during hangover.
 *   VadState_Unknown - value assigned by vadInit(), before any decision.
 */
typedef enum {
VadState_Silence = 0,
VadState_Speech,
VadState_Unknown
} t_VadState;
/*
 * Complete state of one voice activity detector. This file keeps a single
 * file-local instance (vadd) that every function below operates on.
 */
typedef struct _vad {
/* When not TRUE, vadProcess() skips detection and always returns TRUE */
boolean enabled;
/* Smoothed sample energy (STA) maintained by computeSTA() and carried
 * over between input frames; vadInit() seeds it with 10000 */
U4byte sta;
/* Current decision: VadState_Silence, VadState_Speech, or
 * VadState_Unknown until the first decision after vadInit() */
t_VadState state;
/* Countdown, in samples, of non-speech input still required before
 * silence is declared; reloaded with VAD_HANGOVER_CNT_INIT on speech */
int hangoverCnt;
/* Margin in dB above the noise floor beyond which a chunk is
 * considered to be speech */
float noiseTH;
/* Countdown, in samples, after which the noise floor is
 * incremented by 1dB */
int noiseFloorCnt;
/* Noise floor in dB */
float noiseFloor;
/* Countdown after which the signal max is decremented by 1dB.
 * NOTE(review): only initialized by vadInit() in the code visible
 * here; nothing in this file counts it down */
int signalMaxCnt;
/* Signal max in dB.
 * NOTE(review): only initialized by vadInit() in the code visible here */
float signalMax;
/* STARise == 1 if the most recent STA update was an increase
 * STARise == 0 if it was zero or a decrease */
int STARise;
/* Number of state transitions counted by vadSubProcess() */
int stateTxCount;
/* High Pass Filter for input signal */
t_biquad *bq;
} t_vad;
/* File-local singletons: the high-pass biquad state and the detector
 * instance. vadInit() points vadd.bq at vadbq and resets every field. */
LOCAL t_biquad vadbq;
LOCAL t_vad vadd;
void
vadInit()
{
vadd.enabled = TRUE;
vadd.bq = &vadbq;
vadd.sta = 10000;
vadd.noiseTH = VAD_NOISE_TH_BASE;
vadd.state = VadState_Unknown;
vadd.noiseFloorCnt = VAD_NOISEFLOOR_CNT_INIT;
vadd.noiseFloor = VAD_NOISE_FLOOR_INIT;
vadd.hangoverCnt = VAD_HANGOVER_CNT_INIT;
vadd.STARise = 1;
vadd.stateTxCount = 0;
vadd.signalMax = VAD_SIGNAL_MAX_INIT;
vadd.signalMaxCnt = VAD_SIGNALMAX_CNT_INIT;
bqInit(vadd.bq, vhpfB0, vhpfB1, vhpfB2, vhpfA1, vhpfA2);
}
/*
 * computeSTA
 *
 * Feeds 'length' samples from pdata through the energy smoother whose
 * value lives in vadd.sta. For each sample x the update is
 *     rising  (vadd.STARise != 0):  sta += (x*x >> 5) - (sta >> 6)
 *     falling (vadd.STARise == 0):  sta += (x*x >> 8) - (sta >> 9)
 * and vadd.STARise is then set to 1 if that update was positive, else 0.
 *
 * pdata   - input samples
 * length  - number of samples to consume; nothing is consumed if <= 0
 * minSta  - out: smallest STA value seen, starting from the value of
 *           vadd.sta on entry
 *
 * Returns the largest STA value seen, likewise starting from the value
 * of vadd.sta on entry.
 */
LOCAL U4byte
computeSTA(S2byte *pdata, int length, U4byte *minSta)
{
    S2byte *cur = pdata;
    U4byte peak = vadd.sta;
    int remaining;

    *minSta = vadd.sta;

    for (remaining = length; remaining > 0; remaining--, cur++)
    {
        /* q.15 * q.15 = q.30 */
        S4byte energy = (*cur) * (*cur);
        S4byte delta;

        if ( vadd.STARise )
        {
            delta = (energy >> 5) - (S4byte)(vadd.sta >> 6);
        }
        else
        {
            delta = (energy >> 8) - (S4byte)(vadd.sta >> 9);
        }

        vadd.STARise = ( delta > 0 ) ? 1 : 0;
        vadd.sta += delta;

        /* A new peak cannot also be a new minimum, hence else-if */
        if ( vadd.sta > peak )
        {
            peak = vadd.sta;
        }
        else if ( vadd.sta < *minSta )
        {
            *minSta = vadd.sta;
        }
    }

    return peak;
}
/*
 * computeNFE
 *
 * Updates the noise floor estimate in vadd.
 *
 * minpower - lowest power of the chunk just processed, in the same units
 *            as vadd.noiseFloor
 * maxpower - not used by this function
 * length   - number of samples the chunk covered
 *
 * If minpower is at or below the current floor, the floor drops to it
 * and the countdown is reloaded. Otherwise the countdown is reduced by
 * 'length'; when fewer than 'length' samples remain on it, the floor is
 * raised by 1 and the countdown is reloaded, carrying over the remainder.
 */
LOCAL void
computeNFE(float minpower, float maxpower, int length)
{
    /* New minimum: follow it down immediately */
    if ( minpower <= vadd.noiseFloor )
    {
        vadd.noiseFloorCnt = VAD_NOISEFLOOR_CNT_INIT;
        vadd.noiseFloor = minpower;
        return;
    }

    /* Countdown still has room for this chunk */
    if ( vadd.noiseFloorCnt >= length )
    {
        vadd.noiseFloorCnt -= length;
        return;
    }

    /* Countdown expired within this chunk: creep the floor upwards */
    vadd.noiseFloorCnt =
        (VAD_NOISEFLOOR_CNT_INIT + vadd.noiseFloorCnt - length);
    vadd.noiseFloor += 1;
}
/* Wrap limit, in samples, for the running sample counter kept inside
 * vadSubProcess(): the counter is reset to 0 once it reaches this value.
 * Not file-local, so other translation units may reference it. */
unsigned long stopCount = 32000;
/*
 * vadSubProcess
 *
 * Runs the detector over one chunk of at most DATA_FRAME_LENGTH samples
 * (the filtered copy is held in a local buffer of that size).
 *
 * Steps: high-pass filter the chunk, track the min/max STA over it,
 * convert both to power with calcPower(), update the noise floor, then
 * compare the max power against noiseFloor + noiseTH.
 *
 *   - Above threshold: the hangover countdown is reloaded and the state
 *     becomes Speech; stateTxCount is bumped only if the previous state
 *     was Silence.
 *   - Not above threshold, hangover remaining: the countdown is reduced
 *     by 'length' and the chunk is still reported as speech.
 *   - Not above threshold, hangover exhausted: the state becomes Silence;
 *     stateTxCount is bumped if the previous state was anything else
 *     (Speech or Unknown).
 *
 * data   - input samples
 * length - number of samples in the chunk
 *
 * Returns TRUE while speech or hangover is in effect, FALSE for silence.
 */
LOCAL boolean
vadSubProcess(S2byte *data, int length)
{
    S2byte filtered[DATA_FRAME_LENGTH];
    U4byte staRange[2];   /* [0] = minimum, [1] = maximum */
    float powerRange[2];  /* same layout, after calcPower() */
    static unsigned long sampleTally = 0;

    bqProcess(vadd.bq, data, filtered, length);
    staRange[1] = computeSTA(filtered, length, &staRange[0]);
    calcPower(2, staRange, powerRange);
    computeNFE(powerRange[0], powerRange[1], length);

    /* Running sample counter, wrapped at stopCount */
    sampleTally += length;
    if (sampleTally >= stopCount)
    {
        sampleTally = 0;
    }

    /* Speech in this chunk */
    if (powerRange[1] > (vadd.noiseFloor + vadd.noiseTH))
    {
        if ( vadd.state == VadState_Silence )
        {
            vadd.stateTxCount++;
        }
        vadd.state = VadState_Speech;
        vadd.hangoverCnt = VAD_HANGOVER_CNT_INIT;
        return TRUE;
    }

    /* No speech, but still inside the hangover period */
    if ( vadd.hangoverCnt >= length )
    {
        vadd.hangoverCnt -= length;
        return TRUE;
    }

    /* Hangover used up: declare silence */
    if ( vadd.state != VadState_Silence )
    {
        vadd.stateTxCount++;
    }
    vadd.state = VadState_Silence;
    vadd.hangoverCnt = 0;
    return FALSE;
}
/*
 * vadProcess
 *
 * Runs voice activity detection over a frame by cutting it into chunks
 * of at most DATA_FRAME_LENGTH samples (5 ms) and passing each one to
 * vadSubProcess().
 *
 * data   - input samples
 * length - number of samples in the frame
 *
 * Returns: true for speech (at least one chunk reported speech, or the
 *               detector is disabled)
 *          false for silence (also when length <= 0 while enabled)
 */
boolean
vadProcess(S2byte *data, int length)
{
    boolean anySpeech = FALSE;
    S2byte *chunk = data;
    int remaining = length;

    /* A disabled detector treats everything as speech */
    if ( vadd.enabled != TRUE )
    {
        return TRUE;
    }

    while (remaining > 0)
    {
        int n = remaining;

        if (n > DATA_FRAME_LENGTH)
        {
            n = DATA_FRAME_LENGTH;
        }
        anySpeech |= vadSubProcess(chunk, n);
        chunk += n;
        remaining -= n;
    }

    return anySpeech;
}