function [epInSampleIndex, epInFrameIndex, soundSegment, zeroOneVec, others] = endPointDetect(au, opt, showPlot)
% endPointDetect: EPD based on volume and HOD (high-order difference)
%
% Usage:
% [epInSampleIndex, epInFrameIndex, soundSegment, zeroOneVec, others] = endPointDetect(au, opt, showPlot)
%
% Description:
% [epInSampleIndex, epInFrameIndex, soundSegment, zeroOneVec, others] = endPointDetect(au, opt, showPlot)
% epInSampleIndex: two-element end-points in sample index
% epInFrameIndex: two-element end-points in frame index
% soundSegment: segment of voice activity
% zeroOneVec: zero-one vector for each frame
% others: other outputs, which depend on opt.method.
% au: audio object
% opt: parameters for EPD
% showPlot: 0 for silent operation (no plotting), 1 for plotting
%
% Example:
% waveFile='SingaporeIsAFinePlace.wav';
% au=myAudioRead(waveFile);
% opt=endPointDetect('defaultOpt');
% opt.method='vol';
% showPlot = 1;
% [epInSampleIndex, epInFrameIndex, soundSegment] = endPointDetect(au, opt, showPlot);
% Category: Audio endpoint detection
% Roger Jang, 20070323
if nargin<1, selfdemo; return; end
if ischar(au) && strcmpi(au, 'defaultOpt') % Set the default options
epInSampleIndex.method='volZcr'; % Default method
epInSampleIndex.frameDuration=256/8000; % Frame duration in seconds (fs=8000 ===> frameSize=256; fs=16000 ===> frameSize=512)
epInSampleIndex.overlapDuration=0; % Frame overlap
% The following are mainly for method='vol'
epInSampleIndex.volRatio=0.1;
epInSampleIndex.vMinMaxPercentile=3;
% For method='volzcr';
epInSampleIndex.volRatio2=0.2; % Not used for now
epInSampleIndex.zcrRatio=0.1;
epInSampleIndex.zcrShiftGain=4;
% For epdByEntropy
epInSampleIndex.veRatio=0.1;
epInSampleIndex.veMinMaxPercentile=3;
% For method='volhod'
epInSampleIndex.vhRatio=0.012; % 0.11
epInSampleIndex.diffOrder=1;
epInSampleIndex.volWeight=0.76;
epInSampleIndex.vhMinMaxPercentile=2.3; % 5.0%
epInSampleIndex.extendNum=1; % Extend front and back
epInSampleIndex.minSegment=0.068; % Sound segments (in seconds) shorter than or equal to this value are removed
epInSampleIndex.maxSilBetweenSegment=0.416; %
epInSampleIndex.minLastWordDuration=0.2; %
return
end
if nargin<2||isempty(opt), opt=feval(mfilename, 'defaultOpt'); end
if nargin<3, showPlot=0; end
opt.frameSize=round(opt.frameDuration*au.fs);
opt.overlap=round(opt.overlapDuration*au.fs);
y=au.signal; fs=au.fs; nbits=au.nbits;
if size(y, 2)~=1, error('Wave is not mono!'); end
frameSize=opt.frameSize;
overlap=opt.overlap;
minSegment=round(opt.minSegment*fs/(frameSize-overlap));
maxSilBetweenSegment=round(opt.maxSilBetweenSegment*fs/(frameSize-overlap));
%minLastWordDuration=round(opt.minLastWordDuration*fs/(frameSize-overlap));
y=double(y); % convert to double data type
frameMat=enframe(y, frameSize, overlap); % frame blocking
frameMat=frameZeroMean(frameMat, 2); % zero justification
frameNum=size(frameMat, 2); % no. of frames
switch(lower(opt.method))
case 'vol'
volume=frame2volume(frameMat); % compute volume
temp=sort(volume);
index=round(frameNum*opt.vMinMaxPercentile/100); if index==0, index=1; end
volMin=temp(index);
volMax=temp(frameNum-index+1);
volTh=(volMax-volMin)*opt.volRatio+volMin; % compute volume threshold
% ====== Identify voiced part that's larger than volTh
soundSegment=segmentFind(volume>volTh);
% ====== Delete short sound clips
index=[soundSegment.duration]<=minSegment;
soundSegment(index)=[];
% ====== Create zero-one vector
zeroOneVec=logical(0*volume);
for i=1:length(soundSegment)
zeroOneVec(soundSegment(i).begin:soundSegment(i).end)=1;
end
if isempty(soundSegment)
epInSampleIndex=[];
epInFrameIndex=[];
fprintf('Warning: No sound segment found in %s.m.\n', mfilename);
else
epInFrameIndex=[soundSegment(1).begin, soundSegment(end).end];
epInSampleIndex=frame2sampleIndex(epInFrameIndex, frameSize, overlap); % conversion from frame index to sample index
for i=1:length(soundSegment),
soundSegment(i).beginSample = frame2sampleIndex(soundSegment(i).begin, frameSize, overlap);
soundSegment(i).endSample = min(length(y), frame2sampleIndex(soundSegment(i).end, frameSize, overlap));
soundSegment(i).beginFrame = soundSegment(i).begin;
soundSegment(i).endFrame = soundSegment(i).end;
end
soundSegment=rmfield(soundSegment, 'begin');
soundSegment=rmfield(soundSegment, 'end');
% soundSegment=rmfield(soundSegment, 'duration');
end
others.volume=volume; others.volTh=volTh;
case 'volzcr'
volume=frame2volume(frameMat); % compute volume
temp=sort(volume);
index=round(frameNum*opt.vMinMaxPercentile/100); if index==0, index=1; end
volMin=temp(index);
volMax=temp(frameNum-index+1); % To avoid unvoiced sounds
volTh1=(volMax-volMin)*opt.volRatio+volMin; % compute volume threshold
volTh2=(volMax-volMin)*opt.volRatio2+volMin; % compute volume threshold
% ====== Identify voiced part that's larger than volTh1
soundSegment=segmentFind(volume>volTh1);
% ====== Compute ZCR
[minVol, index]=min(volume);
shiftAmount=opt.zcrShiftGain*max(abs(frameMat(:,index))); % shiftAmount is equal to opt.zcrShiftGain times the max. abs. sample within the frame of min. volume
%shiftAmount=max(shiftAmount, 2);
shiftAmount=max(shiftAmount, max(frameMat(:))/100);
zcr=frame2zcr(frameMat, 1, shiftAmount);
zcrTh=max(zcr)*opt.zcrRatio;
% ====== Expansion 1: Expand end points to volume level1 (lower level)
for i=1:length(soundSegment),
head = soundSegment(i).begin;
while (head-1)>=1 & volume(head-1)>=volTh1,
head=head-1;
end
soundSegment(i).begin = head;
tail = soundSegment(i).end;
while (tail+1)<=length(volume) & volume(tail+1)>=volTh1,
tail=tail+1;
end
soundSegment(i).end = tail;
end
% ====== Expansion 2: Expand end points to include high zcr region
for i=1:length(soundSegment),
head = soundSegment(i).begin;
while (head-1)>=1 & zcr(head-1)>zcrTh % Extend at beginning
head=head-1;
end
soundSegment(i).begin = head;
tail = soundSegment(i).end;
while (tail+1)<=length(zcr) & zcr(tail+1)>zcrTh % Extend at ending
tail=tail+1;
end
soundSegment(i).end = tail;
end
% ====== Delete repeated sound segments
index = [];
for i=1:length(soundSegment)-1,
if soundSegment(i).begin==soundSegment(i+1).begin & soundSegment(i).end==soundSegment(i+1).end,
index=[index, i];
end
end
soundSegment(index) = [];
% ====== Delete short sound clips
index = [];
for i=1:length(soundSegment)
soundSegment(i).duration=soundSegment(i).end-soundSegment(i).begin+1; % This is necessary since the duration is changed due to expansion
if soundSegment(i).duration<=minSegment
index = [index, i];
end
end
soundSegment(index) = [];
zeroOneVec=logical(0*volume);
for i=1:length(soundSegment)
for j=soundSegment(i).begin:soundSegment(i).end
zeroOneVec(j)=1;
end
end
if isempty(soundSegment)
epInSampleIndex=[];
epInFrameIndex=[];
fprintf('Warning: No sound segment found in %s.m.\n', mfilename);
else
epInFrameIndex=[soundSegment(1).begin, soundSegment(end).end];
epInSampleIndex=frame2sampleIndex(epInFrameIndex, frameSize, overlap); % conversion from frame index to sample index
for i=1:length(soundSegment),
soundSegment(i).beginSample = frame2sampleIndex(soundSegment(i).begin, frameSize, overlap);
soundSegment(i).endSample = min(length(y), frame2sampleIndex(soundSegment(i).end, frameSize, overlap));
soundSegment(i).beginFrame = soundSegment(i).begin;
soundSegment(i).endFrame = soundSegment(i).end;
end
soundSegment=rmfield(soundSegment, 'begin');
soundSegment=rmfield(soundSegment, 'end');
% soundSegment=rmfield(soundSegment, 'duration');
end
others.volume=volume; others.volTh1=v