function [history, stopReason] = emlars(yin, xin, XTX, type, stopCriterion, regularization, trace, quiet)
% %{
%实现 "LARS"算法和它的lasso修正,参见Efron et. al.2003.的论文
%读论文理解代码的意思,代码的每一行在论文中有相关的等式
% *** CAUTION
% history(1).mu_OLS contains original 'yin' to provide convinience in
% writing user defined stop criterion function. Actual mu_OLS of the first
% step should be just mean of yin which is a simple array of copy of
% history(1).mu. So, if history(1).mu_OLS contains that information, it is
% redundant. In fact, this is also contained in history(1).b, which is a
% bias of the output. Therefore, to provide more information to user who
% want to write his/her own stop criterion function, history(1).mu_OLS
% contains yin.
% history(1).mu_OLS包含初始的'yin'在writing user定义终止准则函数时提供方便
% 实际,mu_OLS的第一步应该意味着yin,是history(1).mu.的简单阵列的复制
%因此,如果history(1).mu_OLS包含这样的信息,就会冗余
%事实上,它也被包含于 history(1).b,这是输出的偏差
%因此,为了给想编写自己停止准则函数的使用者提供更多的信息,history(1).mu_OLS包含yin
% Example 1: 中等大小的x(moderate size x)
% stopCrioterion = {};
% stopCrioterion{1,1} = 'maxKernels';
% stopCrioterion{1,2} = 100;
%
% XTX = lars_getXTX(x_original); % this takes long time.
% sol = lars(y, x, 'lasso', XTX, stopCriterion);
%
% Example 2:非常小的x,或者非常大的x(very small size x,or a really really big size x)
% stopCrioterion = {};
% stopCrioterion{1,1} = 'maxKernels';
% stopCrioterion{1,2} = 100;
%
% sol = lars(y, x, 'lasso', XTX, stopCriterion);
%
% Note:
% Users can add any kind of stop criterion by editing
% the corresponding portion of this file. See the code
% for existing examples.
% 用户可以通过编辑该文件的相应部分来添加任何类型的停止标准。
% Note 2:m文件不会为缺失数据实现编程
% This m-file does not implement routine for missing data.
% %}
%首先定义全局变量
global USING_CLUSTER;%集群
global RESOLUTION_OF_LARS;%分辨率
global REGULARIZATION_FACTOR;%正则化
lars_init();
%正则化
regularization_factor = REGULARIZATION_FACTOR; % Tikhonov regularization factor (or the ridge regression factor)
% -> This should be small enough in this case to get
% a reasonable pseudoinverse.(伪逆)
stopReason = {};
%%% Check parameters
if length(yin)==0 | length(xin)==0
warning('\nInput or Output has zero length.\n');
history.active_set = [];
stopReason{1} = 'Parameter error';
stopReason{2} = 0;
return;
end
if size(yin,1) ~= size(xin,1)
warning('\nSize of y does not match to that of x.\n');
history.active_set = [];
stopReason{1} = 'Parameter error';
stopReason{2} = 0;
return;
end
%strcmp:用于做字符串比较的函数,strcmp('hello','jm'),返回0
if ~strcmp(type, 'lasso') & ~strcmp(type, 'lars') & ~strcmp(type, 'forward_stepwise')
warning('\nUnknown type of regression.\n');
history.active_set = [];
stopReason{1} = 'Parameter error';
stopReason{2} = 0;
return;
end
if strcmp(type, 'forward_stepwise')
warning('\nForward_stepwise is not implemented.\n');
history.active_set = [];
stopReason{1} = 'Parameter error';
stopReason{2} = 0;
return;
end
%exist主要有两种形式,一个参数和两个参数的,作用都是用于确定某值是否存在:
%b = exist( a),若 a 存在,则 b = 1;否则 b = 0;
%b = exist( 'name', 'kind'),kind 表示 name 的类型,可以取的值为:builtin(内建类型)
%class(类),dir(文件夹),file(文件或文件夹),var(变量)
if exist('regularization','var') & ~isempty(regularization)
regularization = 10;
else
regularization = 0;
end
if ~exist('trace','var') | isempty(trace)
trace=0;
end
if ~exist('quiet','var') | isempty(quiet)
quiet=0;
elseif quiet==1
trace=0;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% 数据预处理
% 程序自动对预测变量进行中心化和标准化
%%%?????????????????????
%%%???????????????????????????????????????中心化和标准化的步骤呢?
% if ~exist('XTX','var')
% XTX=[];
% end
%no_xtx = 0;
% if ~quiet & trace >=0
% fprintf('\nLars is using the provided xtx.\n');
% end
% if ~isempty(XTX)
% if ~quiet & trace >=0
% fprintf('\nLars is using the provided xtx.\n');
% end
% elseif size(xin,2)^2 > 10^6
% if ~quiet & trace >=0
% fprintf('Too large matrix (size(x,2)^2 > 10^6). lars will not pre-calculate xtx.\n');
% end
% no_xtx = 1;
% XTX = lars_getXTX(xin,no_xtx);
% else
% % fprintf('\nCalculating xtx.\n');
% XTX = lars_getXTX(xin);
% end
x = xin; % 标准化 xin
n = size(x,1); % # of samples
m = size(x,2); % # of predictors
mx = zeros(1,m) ; %= XTX.mx; % mean xin
sx = ones(1,m) ; %XTX.sx; % length of each column of xin
%ignores = XTX.ignores; % indices for constant terms
all_candidate = [1:m];% indices for all possible columns
%if ~no_xtx
xtx = XTX.xtx; % xtx matrix
xty = XTX.xty;
%dup_columns = XTX.dup_columns; % duplicated columns which will be automatically ignored.
%end
my = 0;
y = yin;
% my = mean(yin);
% y = yin-my;
% Now, we can determine the maximum number of kernels.
%maxKernels = min(maxKernels, min(size(xin,1)-1, length(all_candidate)));
%maxKernels = min(maxKernels, min(rank(xin), length(all_candidate)));
existMaxKernels = 0;
existMSE = 0;
for is = 1:size(stopCriterion,1)
if strcmp(stopCriterion{is,1},'maxKernels')
existMaxKernels = 1;
% stopCriterion{is,2} = min(stopCriterion{is,2}, min(size(xin,1)-1, length(all_candidate)));
% stopCriterion{is,2} = min(stopCriterion{is,2}, min(rank(xin), length(all_candidate)));
if stopCriterion{is,2}<1
warning('Max Kernel is less than 1. It must be larger than 0.\n');
stopCriterion{is,2} = 1;
end
end
if strcmp(stopCriterion{is,1},'MSE')
existMSE = 1;
if stopCriterion{is,2}<1.0e-10
warning('Maximum MSE is too small. Automatically set to 1.0e-10\n');
stopCriterion{is,2} = 1.0e-10;
end
end
end
if ~existMaxKernels
is = size(stopCriterion,1);
stopCriterion{is+1,1} = 'maxKernels';
% stopCriterion{is+1,2} = min(size(xin,1)-1, length(all_candidate)); % Stop when size of active set is data.maxKernels.
stopCriterion{is+1,2} = min(rank(xin), length(all_candidate)); % Stop when size of active set is data.maxKernels.
end
if ~existMSE
is = size(stopCriterion,1);
stopCriterion{is+1,1} = 'MSE';
stopCriterion{is+1,2} = 1.0e-10;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% 初始化
active = []; % active set
inactive = all_candidate; % inactive set
mu_a = zeros(n,1); % current estimate (eq. 2.8)
mu_a_plus = 0; % next estimate (eq. 2.12)
mu_a_OLS = 0; % OLS estimate (eq. 2.19)
beta = zeros(1,size(x,2));
beta_new = beta;
beta_OLS = beta;
history.active_set = [];
history.add = [];
history.drop = [];
history.beta_norm = [];
history.beta = [];
history.df = [];
history.rss = [];
history.b = my;
history.mu = my;
history.beta_OLS_norm = [];
history.beta_OLS = [];
history.b_OLS = my;
history.mu_OLS = my*ones(size(yin));
history.MSE = sum(y.^2)/length(y);
history.R_square = 0;
history.resolution_warn
alvarocfc
- 粉丝: 44
- 资源: 1万+