%% Speech Enhancement For Machinery Noise
% The University of Texas at Dallas
% Statistical Signal Processing Research Laboratory
% Yu Rao Copyright (c)2015Feb
close all;clear all;clc;
snr_level = 0;
% noisy_path = ['../../Database/IEEE_corpus_noisy',num2str(snr_level),'/noisy']; % SSPRL
% clean_path = '../../Database/IEEE_corpus_8k/';
cmp_path = '../../Database/For_Comparison/';
noisy_path = ['./IEEE_corpus_noisy',num2str(snr_level),'/noisy']; % My PC
clean_path = './IEEE_corpus_8k/'; % My PC
n = 1; k = 10;
if n<10
path_1 = [noisy_path,num2str(snr_level),'_S_0',num2str(n)];
path_clean = [clean_path,'clean_S_0',num2str(n)];
else
path_1 = [noisy_path,num2str(snr_level),'_S_',num2str(n)];
path_clean = [clean_path,'clean_S_',num2str(n)];
end
if k<10
noisy_file = [path_1,'_0',num2str(k),'.wav'];
clean_file = [path_clean,'_0',num2str(k),'.wav'];
else
noisy_file = [path_1,'_',num2str(k),'.wav'];
clean_file = [path_clean,'_',num2str(k),'.wav'];
end
% noisy_file = './sp01_babble_sn0.wav';
clearvars path_1 snr_level n k path_clean noisy_path clean_path
% noisy_file = ['Noisy_IEEE',num2str(snr_level),'.wav'];
[noisy,Fs] = audioread(noisy_file);
frame_dur = 0.032; % 32 ms Frame duration
framesize = frame_dur*Fs; % Number of samples in One Frame (AFD)
overlap = 0.5*framesize; % 50% Overlap
shiftsize = framesize - overlap; % Acoustic Frame shift (AFS)
frame_win = (hamming(framesize))'; % Window in Time Domain
mod_dur = 0.064; % 256 ms modulation frame duration
modframesize = mod_dur*Fs/shiftsize; % Modulation frame size
modshift_dur = 0.016; % Modulation Frame Shift
modframeshift = modshift_dur*Fs/shiftsize; % Modulation frame shift size
sys_win = 1./(frame_win(1:overlap)+frame_win(overlap+1:framesize));
inputbuffer = zeros(1,framesize); % Input Buffer
oldbuffer = zeros(1,framesize); % Input Buffer
outputbuffer = zeros(1,shiftsize); % Output Buffer
a_nfft = 2*2^nextpow2(framesize); % Number of FFT points
m_nfft = 2*2^nextpow2(modframesize); % Number of FFT points in Modulation Domain
mod_win = hamming(modframesize); % Modulation frame window
% mod_win = ones(1,modframesize);
mod_sys = 0;
nn = modframesize/modframeshift;
tmp_idx = 0;
for nni = 1:nn
mod_sys = mod_sys+mod_win(tmp_idx+1);
tmp_idx = tmp_idx+modframeshift;
end
mod_sys = 1/mod_sys;
X_phase = zeros(2*modframesize+1,a_nfft); % Long Buffer for phase info
X_mag = zeros(2*modframesize+1,a_nfft); % Long Buffer for magnitude info
Z = zeros(2*modframesize+1,a_nfft,m_nfft); % Long Buffer for Synthesis
z = zeros(1,a_nfft);
mod_phase = zeros(a_nfft,m_nfft); % Modulation Spectrum Phase Response
mod_mag = zeros(a_nfft,m_nfft); % Modulation Spectrum Magnitude Response
D_est = zeros(a_nfft,m_nfft); % Estimated Noise Power (r = 2)
D_est_old = zeros(a_nfft,m_nfft); % Previous Estimated Noide Power (r = 2)
S_est = zeros(a_nfft,m_nfft); % Estimated Clean Signal
coef = zeros(1,a_nfft);
total_frame = ceil(length(noisy)/shiftsize);
final_out = zeros(length(noisy),1); % Final Output for verification
vad_flag = zeros(total_frame,a_nfft);
vv = 0;
vadini_frame = nn; % For Noise Estimation
p = 1;
beta = 0.002;
lambda = 0.98;
mlambda = 1-lambda;
teta = 3; % Threshold (dB)
for frame_idx = 1:total_frame
temp_idx = (frame_idx-1)*shiftsize;
inputbuffer(1:overlap) = inputbuffer(overlap+1:framesize);
if frame_idx ~= total_frame
inputbuffer(overlap+1:framesize) = noisy(temp_idx+1:temp_idx+shiftsize);
else
lastLen = length(noisy(temp_idx+1:end));
inputbuffer(overlap+1:overlap+lastLen) = noisy(temp_idx+1:end);
inputbuffer(overlap+lastLen:end) = 0;
end
x = inputbuffer.*frame_win;
X = fft(x,a_nfft);
X_phase(1:end-1,:) = X_phase(2:end,:);
X_mag(1:end-1,:) = X_mag(2:end,:);
X_phase(end,:) = angle(X);
X_mag(end,:) = abs(X);
Z(1:end-1,:,:) = Z(2:end,:,:);
if frame_idx<=vadini_frame+vv % Assume the first six frames are silent
if frame_idx >= vadini_frame
for k = 1:a_nfft
Y = fft(X_mag(modframesize+2:end,k).*mod_win,m_nfft);
mod_phase(k,:) = angle(Y);
mod_mag(k,:) = abs(Y);
for m = 1:m_nfft
D_est(k,m) = D_est(k,m)+mod_mag(k,m)^2;
end
end
end
if frame_idx == vadini_frame+vv
D_est_old(k,m) = D_est(k,m)/(vv+1);
end
outputbuffer = inputbuffer(overlap+1:end); % No speech Enhancement Stage
else
for k = 1:a_nfft
Y = fft(X_mag(modframesize+2:end,k).*mod_win,m_nfft);
mod_phase(k,:) = angle(Y);
mod_mag(k,:) = abs(Y);
fi = 10*log10(sum(mod_mag(k,:).^2)/sum(D_est_old(k,:)));
% Voice Activity Detector
if fi<3
for m = 1:m_nfft
D_est(k,m) = lambda*D_est_old(k,m)+mlambda*(mod_mag(k,m)^2);
end
vad_flag(frame_idx,k) = 0; % For debugging
else
vad_flag(frame_idx,k) = 1; % For debugging
end
D_est_old(k,:) = D_est(k,:);
%% Spectral Subtraction
for m = 1:m_nfft
tempvar1 = mod_mag(k,m)^2-p*D_est(k,m);
tempvar2 = beta*D_est(k,m);
if tempvar1>=tempvar2
S_est(k,m) = sqrt(tempvar1);
else
S_est(k,m) = sqrt(tempvar2);
end
end
Z(end,k,:) = real(ifft(S_est(k,:).*exp(1i*mod_phase(k,:)),m_nfft)); % Enhanced Output
% Z(end,k,:) = real(ifft(mod_mag(k,:).*exp(1i*mod_phase(k,:)),m_nfft)); % Original Output
coef(k) = 0;
for nnj = 0:nn-1
coef(k) = coef(k)+Z(end-nnj,k,1+nnj);
end
coef(k) = coef(k)*mod_sys;
end
a_frame = real(ifft(coef.*exp(1i*X_phase(end-modframesize+1,:)),a_nfft));
outputbuffer = sys_win.*(a_frame(1:overlap)+oldbuffer(overlap+1:overlap+shiftsize));
oldbuffer = a_frame(1:a_nfft/2);
end
final_out(temp_idx+1:temp_idx+shiftsize) = outputbuffer;
end
total_length = length(final_out);
[cleany,~] = audioread(clean_file);
[ss_out] = modSS(noisy_file);
% [ss_out] = specsub(noisy_file);
subplot(4,1,1);plot(final_out);title('Enhanced');
subplot(4,1,2);plot(ss_out);title('Subectral Subtraction');
subplot(4,1,3);plot(cleany);title('Clean');
subplot(4,1,4);plot(noisy);title('Noisy');
%% PESQ Computation
AMS_enhanced = [cmp_path,'AMS_enhanced.wav'];
New_Clean = [cmp_path,'clean.wav'];
New_Noisy = [cmp_path,'noisy.wav'];
Ori_enhanced = [cmp_path,'Ori_enhanced.wav'];
audiowrite(AMS_enhanced,final_out(mod_dur*Fs+1:total_length),Fs);
audiowrite(New_Clean,cleany(1:total_length-mod_dur*Fs),Fs);
audiowrite(New_Noisy,noisy(1:total_length-mod_dur*Fs),Fs);
audiowrite(Ori_enhanced,ss_out(1:total_length-mod_dur*Fs),Fs);
pesq_noisy = pesq(New_Clean,New_Noisy);
ncm_noisy = NCM(New_Clean,New_Noisy);
pesq_orienhanced = pesq(New_Clean,Ori_enhanced);
ncm_orienhanced = NCM(New_Clean,Ori_enhanced);
pesq_newenhanced = pesq(New_Clean,AMS_enhanced);
ncm_newenhanced = NCM(New_Clean,AMS_enhanced);
display(pesq_noisy(1));display(pesq_orienhanced(1));display(pesq_newenhanced(1));
display(ncm_noisy);display(ncm_orienhanced);display(ncm_newenhanced);
figure;plot(cleany(1:total_length-mod_dur*Fs));hold on;
plot(final_out(mod_dur*Fs+1:total_length),'r');
% figure;plot(final_out(1025:total_length)-noisy(1:total_length-1024));
% sound(final_out,Fs); pause(3.5);
% sound(ss_out,Fs);