function [w1,b1,w2,b2,w3,b3,tr,rq] = lttr3(w1,b1,f1,w2,b2,f2,w3,b3,f3,...
xc,P,T,VA,VAT,TE,TET,TP)
%LTTR3 Trains a large feed-forward network with one hidden layer
%using a truncated Gauss-Newton method on a Tikhonov regularized problem.
%
%The general design of LTTR3 is much the same as the design of the
%functions in the Neural Network (NN) Toolbox from MathWorks even though the
%calling sequences differ. See Neural Network Toolbox User's Guide from the
%MathWorks Inc.
%
%LTTR3 is intended for problems so large that explicit storing of
%the corresponding Jacobian matrix is impossible or inconvenient.
%For smaller problems where the Jacobian can be stored in main memory, there
%is a function TTR3 that uses the Gauss-Newton method (not truncated)
%on a Tikhonov regularized problem. However, even for smaller problems LTTR3
%is often to be preferred to TTR3 (not to mention the Levenberg-Marquardt
%implementations in the NN Toolbox) since it is normally much faster.
%
%You can find more details in two papers by Eriksson J., Gulliksson M.,
%Lindström P. and Wedin P-Å.: "Regularization tools for training
%feed-forward neural networks, Part I: Theory and basic algorithms, Part II:
%Large-scale problems". Both papers are available in PostScript form via
%http://www.cs.umu.se/~perl/research.html
%
%Calling sequence:
% [W1,B1,W2,B2,W3,B3,TR,RQ] =LTTR3(W1,B1,'F1',W2,B2,'F2',W3,B3,'F3',...
% XC,P,T,VA,VAT,TE,TET,TP)
% Wi - Weight matrix of ith layer.
% Bi - Bias vector of ith layer.
% Fi - Transfer function (string) of ith layer.
% XC - Center point for the weights and biases
% P - RxQ matrix of input vectors.
% T - S3xQ matrix of target vectors.
% VA - matrix of validation set
% VAT - matrix of target values corresponding to VA
% TE - matrix of test set
% TET - matrix of target values corresponding to TE
% TP - Training parameters (optional).
%
% Training parameters are:
% TP(1) - #epochs between displays.
% striplen=max(TP(1),5), default = 5
% TP(2) - Maximum number of epochs to train, default = 50
% TP(3) - Sum-squared error goal, default = 0.01
% TP(4) - Generalization loss threshold (%), default = 5
% TP(5) - Initial value of MU, default = 0.1
% TP(6) - Minimum value of MU, default = 0.001
% TP(7) - controls termination criteria. Code is two digits XY.
% X controls outer termination criterion. =1 absolute criterion
% on error function is used, =2 relative difference criterion
% on error function is used, >2 early stopping is used
% (default=3)
% Y controls inner criterion used when the CG-method is used
% to compute the search direction. =1 original criteria from
% LSQR is used, =2 Dembo et al criteria is used, =3 Jerry's
% criterion is used (default=2)
% TP(8) - controls linesearch (In fact there are no alternatives.
% This parameter is not used by the code. The default is
% always used)
% =1 parabola in R(m) is used, =2 curvilinear search is used
% (default=1)
% TP(9) - controls method to compute search direction
% =2 unscaled CG, =3 diagonal scaled CG
% (default =2)
%
% Missing parameters and NaN's are replaced with defaults.
%
% Returns:
% Wi - new weights.
% Bi - new biases.
% TR - resulting matrix containing a row of information corresponding
% to each epoch (iteration).
% Each row contains
% "Fsq MU relquot E(va) norm(dx) progress/1000 alpha"(see below)
% RQ - resulting quantities (see below)
%
% Resulting quantities are: (at iteration k, k=1,2,...,size(TR,1))
% TR(k,1) - Fsq: the objective function
% TR(k,2) - MU: the regularization parameter
% TR(k,3) - relquot: relative convergence quantity
% TR(k,4) - E(va): value of the objective w.r.t. the validation set
% TR(k,5) - norm(dx): 2-norm of the search direction
% TR(k,6) - progress/1000: one of early stopping criteria quantity
% TR(k,7) - alpha: steplength along the search direction
%
% RQ(1) - total number of iterations (epochs)
% RQ(2) - total number of net evaluations (funcev)
% RQ(3) - elapsed time in computing searchdirection
% RQ(4) - #total inner iterations in computing search direction
% RQ(5) - error for the validation set (min(E(va)))
% RQ(6) - error for the test set at the point where min(E(va)) occurs
% RQ(7) - number of epochs until min(E(va)) occurs
% RQ(8) - total time (excluding plotting) inside lttr3.m %
% Per Lindstrom, Computing Science, Umea University, Sweden
% email: perl@cs.umu.se
% Revision: 1.0
%
if nargin < 16
error('LTTR3: Not enough arguments.')
end
% TRAINING PARAMETERS
if nargin == 16, TP = []; end
TP = nndef(TP,[5 50 0.01 5 0.1 0.001 32 1 2]);
striplen = TP(1);
strip=max(TP(1),5);
me = TP(2);
eg = TP(3);
GLthresh= TP(4);
mu_min=TP(6);
mu_init = max([TP(5) mu_min]);
opttest=fix(TP(7)/10);
inner_term=fix(TP(7)-opttest*10);
method=TP(9); %=2 for unscaled CG, =3 for diag.scaled CG
optind=0;
%DEFINE TRANSFER FUNCTIONS
df1 = feval(f1,'delta');
df2 = feval(f2,'delta');
df3 = feval(f3,'delta');
% DEFINE SIZES
[r,q]=size(P);
[s1,r] = size(w1);
[s2,s1] = size(w2);
[s3,s2]=size(w3);
ext_p = nncpyi(P,s3);
% INITIALIZE
totev=0; alpha=1.0;
succesive=0;
GLstop=0; Succstop=0; Quotstop=0;
stopind=0;
mu=mu_init;
n=s1*r+s1+s1*s2+s2+s3*s2+s3; %Total number of unknowns
xc=xc(:); %Make sure it is a column vector
[mvat,nvat]=size(VAT);
nrmb1=1;
nrmfaug=1;
% PRESENTATION PHASE
[a1,a2,a3] = simuff(P,w1,b1,f1,w2,b2,f2,w3,b3,f3);
e = a3-T;
SSE = sumsqr(e);
totev=totev+1;
f=e(:);
x=[w1(:); b1(:); w2(:); b2(:); w3(:); b3(:)]; %Pack as one column vector
Fsq=0.5*(SSE+mu^2*(x-xc)'*(x-xc));
% DEFINE DERIVATIVES CORRESPONDING TO LAYER 1, 2 AND 3
deriv1=feval(df1,a1);
deriv2=feval(df2,a2);
deriv3=feval(df3,a3);
% EVALUATE USING VALIDATION SET
[va1,va2,va3]=simuff(VA,w1,b1,f1,w2,b2,f2,w3,b3,f3);
Eva=sumsqr(va3-VAT);
Eopt=Eva;
evaW1=w1; evaB1=b1;
evaW2=w2; evaB2=b2;
evaW3=w3; evaB3=b3;
Eva_last=Eva;
Etr=Fsq;
minEtr=Etr;
stripsum=0;
% TRAINING RECORD
tr(1,1:7)=zeros(1,7);
tr(1,1:4)= [Fsq mu 0 Eva];
% PLOTTING
clg
message = sprintf('TRAINLTR: %%g/%g epochs, mu = %%g, Fsq = %%g.\n',me);
fprintf(message,0,mu_init,Fsq)
h = ploterr(tr(1,1),eg);
dxItid=0;
dxDtid=0;
Unscaled_iter=0;
Diagonal_scaled_iter=0;
itn_count=0;
totinner=0;
Fdiff=1;
if inner_term == 3
itnlim=5;
else
itnlim=n;
end
tottime=0;
evaiter=0;
%MAIN LOOP
for i=1:me
% GENERALIZATION LOSS
time=cputime;
GL=(Eva/Eopt-1);
tr(i,5:6)=[0 GL];
stripsum=stripsum+Etr;
minEtr=min(minEtr,Etr);
if (GL > GLthresh/100) & (optind == 0)
optind=1;
end
% CHECK PHASE
if SSE < eg, i=i-1; stopind=6; break, end
if opttest == 2
%if Fdiff < 1e-3, i=i-1; stopind=7; break, end
if nrmb1 < 5e-3*nrmfaug, i=i-1; stopind=5; break, end
end
if (opttest > 2) & (rem(i,strip) == 0)
progress=(stripsum/strip/minEtr-1);
tr(i,6)=progress;
stripsum=0;
minEtr=inf;
if (progress < 0.001), i=i-1; stopind=1; break, end
if (GL > 2*GLthresh/100), i=i-1; stopind=2; break; end
if (GLstop == 0) & (GL>GLthresh/100)
GLstop=1;
end
if Eva > Eva_last
succesive=succesive+1;
else
succesive=0;
end
if succesive == 8
Succstop=1;
end
if (GL/progress*10) > 3
Quotstop=1;
end
if (GLstop+Succstop+Quotstop) == 3, i=i-1; stopind=3; break, end
end
tottime=cputime-time+tottime;
if method == 3
% Find the length of the columns in Jaug
% Store 1/length (provided the length isn't zero) in diagonal scaling matix D
%
time=cputime;
ettor=ones(q*s3,1);
D=Jsquared3(2,q*s3,n,ettor,1,1,w1,b1,a1,deriv1,w2,b2,a2,deriv2,w3,b3,...
a3,deriv3,P,T);
D=sqrt(D+mu*mu);
D=~D + D;
D= 1 ./ D;
tottime=cputime-time+tottime;
e