/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vad_core.h"
#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_filterbank.h"
#include "vad_gmm.h"
#include "vad_sp.h"
// Spectrum Weighting
static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
static const int16_t kNoiseUpdateConst = 655; // Q15
static const int16_t kSpeechUpdateConst = 6554; // Q15
static const int16_t kBackEta = 154; // Q8
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {
544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {
11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {
9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {
34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {
48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {
8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;
// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;
// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
// updated with an |offset| before averaging.
//
// - data [i/o] : Data to average.
// - offset [i] : An offset added to |data|.
// - weights [i] : Weights used for averaging.
//
// returns : The weighted average.
static int32_t WeightedAverage(int16_t* data, int16_t offset,
const int16_t* weights) {
int k;
int32_t weighted_average = 0;
for (k = 0; k < kNumGaussians; k++) {
data[k * kNumChannels] += offset;
weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
}
return weighted_average;
}
// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
// type of signal is most probable.
//
// - self [i/o] : Pointer to VAD instance
// - features [i] : Feature vector of length |kNumChannels|
// = log10(energy in frequency band)
// - total_power [i] : Total power in audio frame.
// - frame_length [i] : Number of input samples
//
// - returns : the VAD decision (0 - noise, 1 - speech).
static int16_t GmmProbability(VadInstT* self, int16_t* features,
int16_t total_power, int frame_length) {
int channel, k;
int16_t feature_minimum;
int16_t h0, h1;
int16_t log_likelihood_ratio;
int16_t vadflag = 0;
int16_t shifts_h0, shifts_h1;
int16_t tmp_s16, tmp1_s16, tmp2_s16;
int16_t diff;
int gaussian;
int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
int16_t delt, ndelt;
int16_t maxspe, maxmu;
int16_t deltaN[kTableSize], deltaS[kTableSize];
int16_t ngprvec[kTableSize] = { 0 }; // Conditional probability = 0.
int16_t sgprvec[kTableSize] = { 0 }; // Conditional probability = 0.
int32_t h0_test, h1_test;
int32_t tmp1_s32, tmp2_s32;
int32_t sum_log_likelihood_ratios = 0;
int32_t noise_global_mean, speech_global_mean;
int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
int16_t overhead1, overhead2, individualTest, totalTest;
// Set various thresholds based on frame lengths (80, 160 or 240 samples).
if (frame_length == 80) {
overhead1 = self->over_hang_max_1[0];
overhead2 = self->over_hang_max_2[0];
individualTest = self->individual[0];
totalTest = self->total[0];
} else if (frame_length == 160) {
overhead1 = self->over_hang_max_1[1];
overhead2 = self->over_hang_max_2[1];
individualTest = self->individual[1];
totalTest = self->total[1];
} else {
overhead1 = self->over_hang_max_1[2];
overhead2 = self->over_hang_max_2[2];
individualTest = self->individual[2];
totalTest = self->total[2];
}
if (total_power > kMinEnergy) {
// The signal power of current frame is large enough for processing. The
// processing consists of two parts:
// 1) Calculating the likelihood of speech and thereby a VAD decision.
// 2) Updating the underlying model, w.r.t., the decision made.
// The detection scheme is an LRT with hypothesis
// H0: Noise
// H1: Speech
//
// We combine a global LRT with local tests, for each frequency sub-band,
// here defined as |channel|.
for (channel = 0; channel < kNumChannels; channel++) {
// For each channel we model the probability with a GMM consisting of
// |kNumGaussians|, with different means and standard deviations depending
// on H0 or H1.
h0_test = 0;
h1_test = 0;
for (k = 0; k < kNumGaussians; k++) {
gaussian = channel + k * kNumChannels;
// Probability under H0, that is, probability of frame being noise.
// Value given in Q27 = Q7 * Q20.
tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
self->noise_means[gaussian],