function [f0_time,f0_value,SHR,f0_candidates]=shrp(Y,Fs,F0MinMax,frame_length,timestep,SHR_Threshold,ceiling,med_smooth,CHECK_VOICING)
% SHRP - a pitch determination algorithm based on Subharmonic-to-Harmonic Ratio (SHR)
% [f0_time,f0_value,SHR,f0_candidates]=shrp(Y,Fs[,F0MinMax,frame_length,TimeStep,SHR_Threshold,Ceiling,med_smooth,CHECK_VOICING])
%
% Input parameters (There are 9):
%
% Y: Input data
% Fs: Sampling frequency (e.g., 16000 Hz)
% F0MinMax: 2-d array specifies the F0 range. [minf0 maxf0], default: [50 550]
% Quick solutions:
% For male speech: [50 250]
% For female speech: [120 400]
% frame_length: length of each frame in millisecond (default: 40 ms)
% TimeStep: Interval for updating short-term analysis in millisecond (default: 10 ms)
% SHR_Threshold: Subharmonic-to-harmonic ratio threshold in the range of [0,1] (default: 0.4).
% If the estimated SHR is greater than the threshold, the subharmonic is regarded as F0 candidate,
% Otherwise, the harmonic is favored.
% Ceiling: Upper bound of the frequencies that are used for estimating pitch. (default: 1250 Hz)
% med_smooth: the order of the median smoothing (default: 0 - no smoothing);
% CHECK_VOICING: check voicing. Current voicing determination algorithm is kind of crude.
% 0: no voicing checking (default)
% 1: voicing checking
% Output parameters:
%
% f0_time: an array stores the times for the F0 points
% f0_value: an array stores F0 values
% SHR: an array stores subharmonic-to-harmonic ratio for each frame
% f0_candidates: a matrix stores the f0 candidates for each frames, currently two f0 values generated for each frame.
% Each row (a frame) contains two values in increasing order, i.e., [low_f0 higher_f0].
% For SHR=0, the first f0 is 0. The purpose of this is that when you want to test different SHR
% thresholds, you don't need to re-run the whole algorithm. You can choose to select the lower or higher
% value based on the shr value of this frame.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Permission to use, copy, modify, and distribute this software without fee is hereby granted
% FOR RESEARCH PURPOSES only, provided that this copyright notice appears in all copies
% and in all supporting documentation.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
%
% For details of the algorithm, please see
% Sun, X.,"Pitch determination and voice quality analysis using subharmonic-to-harmonic ratio" To appear in the Proc. of ICASSP2002, Orlando, Florida, May 13 -17, 2002.
% For update information, please check http://mel.speech.nwu.edu/sunxj/pda.htm.
%
% Copyright (c) 2001 Xuejing Sun
% Department of Communication Sciences and Disorders
% Northwestern University, USA
% sunxj@northwestern.edu
%
% Update history:
% Added "f0_candidates" as a return value, Dec. 21, 2001
% Changed default median smoothing order from 5 to 0, Jan. 9, 2002
% Modified the GetLogSpectrum function, bug fixed due to Herbert Griebel. Jan. 15, 2002
% Several minor changes. Jan. 15,2002.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%t0 = clock;
%------------------ Get input arguments values and set default values -------------------------
if nargin<9
CHECK_VOICING=0;
end
if nargin<8
med_smooth=0;
end
if nargin<7
ceiling=1250;
end
if nargin<6
SHR_Threshold=0.4; % subharmonic to harmonic ratio threshold
end
if nargin<5
timestep=10;
%timestep=6.4;
end
if nargin<4
frame_length=40; % default 40 ms
end
if nargin<3
minf0=50;
maxf0=500;
else
minf0=F0MinMax(1);
maxf0=F0MinMax(2);
end
if nargin<2
error('Sampling rate must be supplied!')
end
segmentduration=frame_length;
%------------------- pre-processing input signal -------------------------
Y=Y-mean(Y); % remove DC component
Y=Y/max(abs(Y)); %normalization
total_len=length(Y);
%------------------ specify some algorithm-specific thresholds -------------------------
interpolation_depth=0.5; % for FFT length
%--------------- derived thresholds specific to the algorithm -------------------------------
maxlogf=log2(maxf0/2);
minlogf=log2(minf0/2); % the search region to compute SHR is as low as 0.5 minf0.
N=floor(ceiling/minf0); % maximum number harmonics
m=mod(N,2);
N=N-m;
N=N*4; %In fact, in most cases we don't need to multiply N by 4 and get equally good results yet much faster.
% derive how many frames we have based on segment length and timestep.
segmentlen=round(segmentduration*(Fs/1000));
inc=round(timestep*(Fs/1000));
nf = fix((total_len-segmentlen+inc)/inc);
n=(1:nf);
f0_time=((n-1)*timestep+segmentduration/2)'; % anchor time for each frame, the middle point
%f0_time=((n-1)*timestep)'; % anchor time for each frame, starting from zero
%------------------ determine FFT length ---------------------
fftlen=1;
while (fftlen < segmentlen * (1 +interpolation_depth))
fftlen =fftlen* 2;
end
%----------------- derive linear and log frequency scale ----------------
frequency=Fs*(1:fftlen/2)/fftlen; % we ignore frequency 0 here since we need to do log transformation later and won't use it anyway.
limit=find(frequency>=ceiling);
limit=limit(1); % only the first is useful
frequency=frequency(1:limit);
logf=log2(frequency);
%% clear some variables to save memory
clear frequency;
min_bin=logf(end)-logf(end-1); % the minimum distance between two points after interpolation
shift=log2(N); % shift distance
shift_units=round(shift/min_bin); %the number of unit on the log x-axis
i=(2:N);
% ------------- the followings are universal for all the frames ---------------%%
startpos=shift_units+1-round(log2(i)/min_bin); % find out all the start position of each shift
index=find(startpos<1); % find out those positions that are less than 1
startpos(index)=1; % set them to 1 since the array index starts from 1 in matlab
interp_logf=logf(1):min_bin:logf(end);
interp_len=length(interp_logf);% new length of the amplitude spectrum after interpolation
totallen=shift_units+interp_len;
endpos=startpos+interp_len-1; %% note that : totallen=shift_units+interp_len;
index=find(endpos>totallen);
endpos(index)=totallen; % make sure all the end positions not greater than the totoal length of the shift spectrum
newfre=2.^(interp_logf); % the linear Hz scale derived from the interpolated log scale
upperbound=find(interp_logf>=maxlogf); % find out the index of upper bound of search region on the log frequency scale.
upperbound=upperbound(1);% only the first element is useful
lowerbound=find(interp_logf>=minlogf); % find out the index of lower bound of search region on the log frequency scale.
lowerbound=lowerbound(1);
%----------------- segmentation of speech ------------------------------
curpos=round(f0_time/1000*Fs); % position for each frame in terms of index, not time
frames=toframes(Y,curpos,segmentlen,'hamm');
[nf framelen]=size(frames);
clear Y;
%----------------- initialize vectors for f0 time, f0 values, and SHR
f0_value=zeros(nf,1);
SHR=zeros(nf,1);
f0_time=f0_time(1:nf);
f0_candidates=zeros(nf,2);
%----------------- voicing determination ----------------------------
if (CHECK_VOICING)
NoiseFloor=sum(frames(1,:).^2);
voicing=vda(frames,segmentduration/1000,NoiseFloor);
else
voicing=ones(nf,1);
end
%------------------- the main loop -----------------------
cu