/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This file includes the implementation of the core functionality in VAD.
* For function description, see vad_core.h.
*/
#include "vad_core.h"
#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_defines.h"
#include "vad_filterbank.h"
#include "vad_gmm.h"
#include "vad_sp.h"
// Spectrum Weighting
// Six weights; presumably one per frequency channel -- the code that consumes
// them is not in this part of the file (TODO confirm).
static const WebRtc_Word16 kSpectrumWeight[6] = { 6, 8, 10, 12, 14, 16 };
// Model update constants in fixed point. Read with the stated Q formats they
// are roughly 0.02 (655 / 2^15), 0.2 (6554 / 2^15) and 0.6 (154 / 2^8).
static const WebRtc_Word16 kNoiseUpdateConst = 655; // Q15
static const WebRtc_Word16 kSpeechUpdateConst = 6554; // Q15
static const WebRtc_Word16 kBackEta = 154; // Q8
// Minimum difference between the two models, Q5
// (544 / 2^5 = 17, 576 / 2^5 = 18)
static const WebRtc_Word16 kMinimumDifference[6] = {
544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
// (11392 / 2^7 = 89, 11520 / 2^7 = 90)
static const WebRtc_Word16 kMaximumSpeech[6] = {
11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
// NOTE(review): two entries, presumably one per Gaussian; no Q format is
// stated here (5 and 6 if Q7 like the other mean limits) -- confirm.
static const WebRtc_Word16 kMinimumMean[2] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
// (values step down from 72 to 67 in units of 2^7)
static const WebRtc_Word16 kMaximumNoise[6] = {
9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Each table below holds 12 entries = 2 Gaussians x 6 channels.
// NOTE(review): the entry ordering (all channels of Gaussian 0 first, or
// interleaved) is not visible in this part of the file -- confirm against the
// code that indexes these tables.
// WebRtcVad_InitCore copies the mean and std tables into the VAD instance;
// it does not copy the two weight tables.
// Weights for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataWeights[12] = {
34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataWeights[12] = {
48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataMeans[12] = {
6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataMeans[12] = {
8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataStds[12] = {
378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataStds[12] = {
555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
// Marker value that WebRtcVad_InitCore writes to inst->init_flag once
// initialization has completed.
static const int kInitCheck = 42;
// Initialize VAD
//
// Puts the VAD instance into its start-up state: detector counters, filter
// memories, the Gaussian model start values, the minimum-tracking buffers and
// the aggressiveness thresholds for the requested mode.
//
// - inst : instance to set up. It is dereferenced without a NULL check.
// - mode : 0 = quality, 1 = low bitrate, 2 = aggressive. Any other value
//          selects the very aggressive settings.
//
// Always returns 0. On return inst->init_flag holds kInitCheck.
int WebRtcVad_InitCore(VadInstT *inst, short mode)
{
    int k;

    // Detector state and counters.
    inst->vad = 1;
    inst->frame_counter = 0;
    inst->over_hang = 0;
    inst->num_of_speech = 0;

    // Downsampling filter memory (four entries).
    for (k = 0; k < 4; k++) {
        inst->downsampling_filter_states[k] = 0;
    }

    // Load the start values of the Gaussian models (means and stds only).
    for (k = 0; k < NUM_TABLE_VALUES; k++) {
        inst->noise_means[k] = kNoiseDataMeans[k];
        inst->speech_means[k] = kSpeechDataMeans[k];
        inst->noise_stds[k] = kNoiseDataStds[k];
        inst->speech_stds[k] = kSpeechDataStds[k];
    }

    // Minimum-value and index vectors: 16 slots per channel.
    for (k = 0; k < 16 * NUM_CHANNELS; k++) {
        inst->low_value_vector[k] = 10000;
        inst->index_vector[k] = 0;
    }

    // Upper/lower filter states (five entries each).
    for (k = 0; k < 5; k++) {
        inst->upper_state[k] = 0;
        inst->lower_state[k] = 0;
    }

    // High-pass filter memory (four entries).
    for (k = 0; k < 4; k++) {
        inst->hp_filter_state[k] = 0;
    }

    // Mean value memory, for the FindMin function (six entries).
    for (k = 0; k < 6; k++) {
        inst->mean_value[k] = 1600;
    }

    // Aggressiveness thresholds. In every table index 0 takes the *_10MS_*
    // constant, index 1 the *_20MS_* constant and index 2 the *_30MS_*
    // constant. over_hang_max_1 is the overhang for short speech bursts,
    // over_hang_max_2 the overhang for long speech bursts.
    switch (mode) {
    case 0:
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q;
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q;
        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->total[0] = TOTAL_10MS_Q;

        inst->over_hang_max_1[1] = OHMAX1_20MS_Q;
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->total[1] = TOTAL_20MS_Q;

        inst->over_hang_max_1[2] = OHMAX1_30MS_Q;
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
        break;

    case 1:
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR;
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR;
        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->total[0] = TOTAL_10MS_LBR;

        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR;
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;

        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR;
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
        break;

    case 2:
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG;
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG;
        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->total[0] = TOTAL_10MS_AGG;

        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG;
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;

        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG;
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
        break;

    default:
        // Very aggressive mode (also reached for any mode outside 0..2)
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG;
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG;
        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->total[0] = TOTAL_10MS_VAG;

        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG;
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;

        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG;
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
        break;
    }

    // Record that initialization has run.
    inst->init_flag = kInitCheck;
    return 0;
}
// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
{
if (mode == 0)
{
// Quality mode
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
webrtc_vad_extract.zip (67个子文件)
webrtc_vad_extract
division_operations.c 4KB
downsample_fast.c 2KB
cross_correlation.o 5KB
complex_fft.o 14KB
spl_sqrt.o 6KB
energy.c 1KB
complex_bit_reverse.c 1KB
vad_sp.c 6KB
vad_core.o 35KB
get_hanning_window.o 5KB
vector_scaling_operations.o 12KB
get_hanning_window.c 3KB
get_scaling_square.c 1KB
fft.o 31KB
division_operations.o 7KB
webrtc_vad.o 12KB
cross_correlation.c 8KB
downsample_fast.o 6KB
spl_sqrt_floor.o 5KB
spl_sqrt_floor.c 2KB
spl_sqrt.c 5KB
spl_version.o 5KB
vad_filterbank.c 10KB
complex_fft.c 19KB
spl_version.c 764B
webrtc_vad.c 4KB
vad_core.c 27KB
vad_gmm.c 3KB
Android.mk 2KB
libs
shared_file_no_found~ 119B
libwebrtc_vad_my.so 142KB
shared_file_no_found 119B
min_max_operations.c 7KB
makefile 2KB
vector_scaling_operations.c 4KB
fft.c 26KB
vad_gmm.o 6KB
min_max_operations.o 13KB
test_code
vad_test.c 5KB
speech_noisy.wav 1.1MB
q-0dB-15.wav 879KB
vad_test 13KB
audio_1
Untitled Document~ 1KB
compile_order 70B
speech_noisy_20cm.wav 1.26MB
speech.wav 690KB
q-10dB-15.wav 827KB
compile_order~ 70B
vad_filterbank.o 20KB
get_scaling_square.o 13KB
complex_bit_reverse.o 4KB
energy.o 5KB
include
fft.h 2KB
webrtc_vad.h 5KB
vad_defines.h 3KB
isac.h 25KB
vad_core.h 4KB
signal_processing_library.h 59KB
structs.h 13KB
structs.h~ 12KB
vad_gmm.h 1KB
typedefs.h 5KB
vad_sp.h 2KB
spl_inl.h 4KB
settings.h 8KB
vad_filterbank.h 5KB
vad_sp.o 11KB
共 67 条
- 1
资源评论
- 呼拉z2018-07-13可以指导一下,我怎么使用这个demo吗??最近在研究vad算法
- qingheli1232019-04-03还可以!!
- 鲁峰20122023-10-21整体还不错的 挺不错的
- 良心不安2019-07-22库可以编译和使用,例子程序需要自己写makefile
- daad7772020-04-23可以用 给力的
鹿克同学
- 粉丝: 5
- 资源: 14
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- 机器学习和数据挖掘课程设计-米其林餐厅数据挖掘管理系统源码+使用文档说明.zip
- html html html展示我与ai的对化
- 数据结构课程设计-全国交通出行咨询模拟系统C语言实现源码.zip
- cef-binary-109.0.1+gcd5e37a+chromium-109.0.5414.8-windows32
- 基于C语言的全国交通咨询系统模拟源码.zip
- 正点原子HAL库 STM32F4 DMA(学习自用附源码)
- 炫酷代码雨,超级炫酷哦!!!
- 基于物联网MQTT协议的智能停车场管理系统
- POETIZE个人博客系统源码 - 最美博客
- 基于深度学习的行人检测系统源码+项目说明(YoloV3+Tensorflow).zip
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功