function model = cv( X, Y, opt)
%model = cv( X, Y, opt)
%
% Cross validation by pls, pcr or PARAFAC. Leave-one-out or
% other cross validation models may be used.
%
% INPUT
% X X-array
% Y Response variables
% opt Includes the fields: nLV, class, method, weights, Val, xpp, ypp, fac and view.
% Type 'opt = cv;' to get the default values
%
% OUTPUT
% model A struct array with two main fields: 'Val' and 'Cal'. These
% again contains the following fields: calibration model,
% predictions, residuals in X
%
% See also: cvind
% Copyright, 2014 -
% This M-file and the code in it belongs to the holder of the
% copyrights and is made public under the following constraints:
% It must not be changed or modified and code cannot be added.
% The file must be regarded as read-only.
% In case of doubt, contact the holder of the copyrights.
%
% Åsmund Rinnan
% E-mail asmundrinnan@gmail.com
%Too few inputs: fill 'model' with the default option structure.
%Calling 'cv' with no inputs returns these defaults directly.
if nargin < 3 %covers nargin == 0 as well (was: nargin == 0 || nargin < 3, which is redundant)
model.nLV = 10;
model.class = false;
model.method = 'pls';
model.weights.X = NaN;
model.weights.Y = NaN;
model.Val.ind = NaN;
model.Val.rep = NaN;
%Default pre-processing for both X and Y: mean-centering only
model.xpp.met{1} = 'mc';
model.xpp.set(1).ref = NaN;
model.xpp.set(1).ax = NaN;
model.xpp.set(1).set = NaN;
model.ypp.met{1} = 'mc';
model.ypp.set(1).ref = NaN;
model.ypp.set(1).ax = NaN;
model.ypp.set(1).set = NaN;
model.fac = NaN;
model.view = 1;
%Human-readable description of each option field
model.info = {'nLV - Number of Latent Variables to be used'; ...
'class - false for continous data, true for classification problems'; ...
'method can be ''PLS'', ''WPLS'', ''PARAFAC'', ''PCR'' or ''N-PLS'''; ...
'Val - ind: Index for cross-validation (see ''cvind'' or cvpos''';...
' rep: vector of sample numbers (i.e. replicates have same number';...
'xpp - Pre-processing of X. It will always be done in the following order:'; ...
' SNV/ MSC, followed by SG, followed by scaling and centering'; ...
' If multiple pp is wanted, give them in cells.'; ...
'ypp - As above but for Y.'; ...
'fac - Initial guesses of the PARAFAC factors.'; ...
'view - 1 = show progress, anything else = don''t show progress'};
if nargin == 0
return
end
end
%Check that X and Y have the same number of rows (samples)
n = size(X);
[rY,kY]=size(Y);
if n(1)~=rY
error('''X'' and ''Y'' should have the same number of samples.')
end
%Use Y( :) so the check fires when ANY element is NaN. With a matrix Y,
%'if any( isnan( Y) )' only triggers when every column contains a NaN,
%because 'if' on a vector requires all elements to be true.
if any( isnan( Y( :) ) )
error( 'You have missing values in y, remove these prior to running ''cv''')
end
%Cap the number of latent variables when X has few samples or variables
maxLV = min( size( X) );
if opt.nLV > maxLV
opt.nLV = maxLV;
end
%Only the regression methods listed below are supported
opt.method = lower( opt.method);
if ~any( strcmp( opt.method, {'pcr', 'pls', 'parafac', 'npls', 'wpls'}) )
error( [opt.method ' is not a recognized method. Select between: pcr, pls, parafac, npls and wpls'])
end
%Default replicate index: each sample is its own replicate
if isnan( opt.Val.rep)
opt.Val.rep = ( 1:n(1) )';
end
%The default value of 'pos'
if isnan( opt.Val.ind)
%Fixed: the response matrix is 'Y' - lowercase 'y' was undefined here
opt.Val.ind = cvind( Y, opt.Val.rep, min( n(1),20), true( 1, 2));
end
%In case of PLS-DA and the Y-variable is just given as a classindex rather
%than a dummy variable
if opt.class && size( Y, 2) == 1 && length( fjernlike( Y( :, 1) ) ) > 2
%NOTE(review): 'fjernlike' appears to return the unique values of Y plus
%an index matrix 'yi' padded with NaNs (one row per unique value) - TODO
%confirm against its definition
[~, yi] = fjernlike( Y);
%Row number of 'yi' is the class number; replicate it across the columns
yc = (1:size( yi, 1) )' * ones( 1, size( yi, 2) );
yc( isnan( yi) ) = NaN;
ytemp = zeros( size( Y) );
%Flatten both matrices and drop the NaN padding, leaving sample positions
%in 'yi' with the matching class numbers in 'yc'
yi = vec( yi');
yc = vec( yc');
yi( isnan( yi) ) = [];
yc( isnan( yc) ) = [];
ytemp( yi) = yc;
%Expand the class index into a logical dummy matrix (one column per class)
Y = full( sparse( (1:size( Y, 1) )', ytemp, true( size( Y, 1), 1) ) );
end
%If fac is not given, and method is PARAFAC, this is calculated
%Fixed two bugs: the comparison used 'par', which can never match since the
%method check above only accepts 'parafac'; and 'nLV' was missing its
%'opt.' prefix (the bare name is undefined in this scope)
if strcmp( 'parafac', opt.method)
if isnan( opt.fac)
opt.fac = parafac( X, opt.nLV, [0 0 0 0 -1]);
end
end
opt.view = opt.view == 1; %Normalize 'view' to a logical flag
%There may be columns in X or Y with zero variance
%X( :, :) unfolds a possible multi-way array to two modes, so 'var' yields
%a row vector and '||' below always receives scalar operands ('var' on an
%N-way array returns an N-way result, which would make '||' error out)
if any( var( Y) == 0) || any( var( X( :, :) ) == 0)
error( 'Some of your variables have zero variance! Remove them, and run again')
end
%Initialize the prediction of Y: one column per latent variable
ypred = zeros( rY, opt.nLV);
%Start the waiting bar
if opt.view
h = waitbar(0,['Cross validation is now running using ' opt.method]);
%Units must be set to normalized BEFORE the position, so the position
%vector below is interpreted relative to the screen
set(h,'units','normalized');
set(h,'position',[.4 .6 .3 .075]);
end
%Initialize the output struct: 'Cal' holds the calibration models,
%'Val' the cross-validated predictions, 'Org' keeps the original data
model.Cal=struct('Yp',[],'rms',[],'Xres',[],'B',[],'T',[],'P',[],'U',[],'C',[],'W',[]);
model.Val=struct('Yp',[],'rms',[],'Xres',[],'B',[],'T',[], 'U', []);
model.Org.Y = Y;
model.Org.rep = opt.Val.rep;
%Enforce the pre-processing order for both X and Y: SNV/MSC first, then
%Savitzky-Golay, then centering/scaling. Only the LAST requested entry of
%each category is kept (hence max over the matching positions).
for fld = {'xpp', 'ypp'}
met = vec( opt.(fld{1}).met);
order = [ max( [0; find( strcmp( met, 'snv') | strcmp( met, 'msc') )] ), ...
max( [0; find( strcmp( met, 'sg') )] ), ...
max( [0; find( strcmp( met, 'mc') | strcmp( met, 'as') | strcmp( met, 'ps') )] )];
order( order == 0) = []; %Drop the categories that were not requested
opt.(fld{1}).met = opt.(fld{1}).met( order);
opt.(fld{1}).set = opt.(fld{1}).set( order);
end
%Indicator if there are missing values in the original X-matrix
test = isnan( sum( vec( X) ) );
%The cross-validation itself
ur = fjernlike( opt.Val.ind);
for ci = 1:length( ur)
if opt.view == 1
waitbar( ci/length(ur),h)
end
Xc = reshape( X( opt.Val.ind ~= ur(ci), :), [sum( opt.Val.ind ~= ur(ci) ) n(2:end)]);
Xp = reshape( X( opt.Val.ind == ur(ci), :), [sum( opt.Val.ind == ur( ci) ) n(2:end)]);
Yc = Y( opt.Val.ind ~= ur( ci), :);
Yp = Y( opt.Val.ind == ur( ci), :);
%The part model and prediction is performed in a separate function
mod = regres( Xc, Xp, Yc, Yp, opt);
switch opt.method
case 'pls'
if isempty( mod) %Check if any model has been made
error( 'There are several samples where the variable(s) do not vary')
else
%Check if this is a classification problem, and transform the predictions
%into classifications
if opt.class
if size( Y, 2) > 1 %Multi-class problem
ypclass = zeros( size( mod.yp{1}) );
for cf = 1:size( mod.yp{1}, 2)
ypred = zeros( size( mod.yp{1}, 1), length( mod.yp) );
for cc = 1:length( mod.yp)
ypred( :, cc) = mod.yp{cc}( :, cf);
end
[~, j] = min( abs( 1 - ypred), [], 2 ); %Find the column closest to one
%030212 AAR Doesn't seem like it makes sense to set some to non-classified.
% Tested on fish data
% ypred = sort( ypred, 2);
% j( ypred( :, end) - ypred( :, end-1) < .025) = NaN;
ypclass( :, cf) = j;
end
else %Two class problem
% error( 'Should calculate a ROC-curve and thus get a better result!')
ypclass = mod.yp > sum( Yc == 0)/ length( Yc);
%300812 AAR Changed the classification limit from
% 0.5 to be dependent on the number of
% samples in each group
end
mod.yp = ypclass;
end
if ~isnan( sum( vec( Xc) ) ) && test && ~iscell( mod.Tp)
mod.Tp = mat2cell( mod.Tp, size( mod.Tp, 1), ones( size( mod.Tp, 2), 1) );
mod.Up = mat2cell( mod.Up, size( mod.Up, 1), ones( size( mod.Up, 2), 1) );
end
if test
model.Val.Yp( opt.Val.ind == ur( ci), 1:size( mod.yp, 2) ) = mod.yp;
for cf = 1:size( mod.B, 2)
%AAR 091110 Two lines