%**************************************************************
%* mex interface to Andy Liaw et al.'s C code (used in R package randomForest)
%* Added by Abhishek Jaiantilal ( abhishek.jaiantilal@colorado.edu )
%* License: GPLv2
%* Version: 0.02
%
% Calls Regression Random Forest
% A wrapper matlab file that calls the mex file
% This does training given the data and labels
% Documentation copied from R-packages pdf
% http://cran.r-project.org/web/packages/randomForest/randomForest.pdf
% Tutorial on getting this working in tutorial_ClassRF.m
%%**************************************************************
% function model = regRF_train(p_train, t_train, ntree, mtry, extra_options)
%
% ___Options
% requires 2 arguments and the rest 3 are optional
% X: data matrix
% Y: target values
% ntree (optional): number of trees (default is 500). also if set to 0
% will default to 500
% mtry (default is max(floor(D/3), 1), D = number of features in X). also if set to 0
% will default to max(floor(D/3), 1)
%
%
% Note: TRUE = 1 and FALSE = 0 below
% extra_options represent a structure containing various misc. options to
% control the RF
% extra_options.replace = 0 or 1 (default is 1) sampling with or without
% replacement
% extra_options.strata = (not Implemented)
% extra_options.sampsize = Size(s) of sample to draw. For classification,
% if sampsize is a vector of the length the number of strata, then sampling is stratified by strata,
% and the elements of sampsize indicate the numbers to be drawn from the strata. I don't yet know how this works.
% extra_options.nodesize = Minimum size of terminal nodes. Setting this number larger causes smaller trees
% to be grown (and thus take less time). Note that the default values are different
% for classification (1) and regression (5).
% extra_options.importance = Should importance of predictors be assessed?
% extra_options.localImp = Should casewise importance measure be computed? (Setting this to TRUE will
% override importance.)
% extra_options.proximity = Should proximity measure among the rows be calculated?
% extra_options.oob_prox = Should proximity be calculated only on 'out-of-bag' data?
% extra_options.do_trace = If set to TRUE, give a more verbose output as randomForest is run. If set to
% some integer, then running output is printed for every
% do_trace trees.
% extra_options.keep_inbag = Should an n by ntree matrix be returned that keeps track of which samples are
% 'in-bag' in which trees (but not how many times, if sampling with replacement)
% extra_options.corr_bias = which happens only for regression. perform bias correction for regression? Note: Experimental. Use at your own
% risk.
% extra_options.nPerm = Number of times the OOB data are permuted per tree for assessing variable
% importance. Number larger than 1 gives slightly more stable estimate, but not
% very effective. Currently only implemented for regression.
%
%
% ___Returns model which has
% importance = a matrix with nclass + 2 (for classification) or two (for regression) columns.
% For classification, the first nclass columns are the class-specific measures
% computed as mean decrease in accuracy. The nclass + 1st column is the
% mean decrease in accuracy over all classes. The last column is the mean decrease
% in Gini index. For Regression, the first column is the mean decrease in
% accuracy and the second the mean decrease in MSE. If importance=FALSE,
% the last measure is still returned as a vector.
% importanceSD = The ?standard errors? of the permutation-based importance measure. For classification,
% a p by nclass + 1 matrix corresponding to the first nclass + 1
% columns of the importance matrix. For regression, a length p vector.
% localImp = a p by n matrix containing the casewise importance measures, the [i,j] element
% of which is the importance of i-th variable on the j-th case. NULL if
% localImp=FALSE.
% ntree = number of trees grown.
% mtry = number of predictors sampled for spliting at each node.
% votes (classification only) a matrix with one row for each input data point and one
% column for each class, giving the fraction or number of ?votes? from the random
% forest.
% oob_times number of times cases are 'out-of-bag' (and thus used in computing OOB error
% estimate)
% proximity if proximity=TRUE when randomForest is called, a matrix of proximity
% measures among the input (based on the frequency that pairs of data points are
% in the same terminal nodes).
% errtr = first column is OOB Err rate, second is for class 1 and so on
% mse =(regression only) vector of mean square errors: sum of squared residuals divided
% by n.
% rsq (regression only) 'pseudo R-squared': 1 - mse / Var(y).
function model = regRF_train(p_train, t_train, ntree, mtry, extra_options)
% regRF_train  Train a regression random forest through the mex interface
% to the randomForest C code. See the long header comment above for the
% documentation of the extra_options fields and of the returned model.
%
% requires 2 arguments and the rest 2 are optional
% p_train: data matrix (one row per sample, one column per feature)
% t_train: target values (one per row of p_train)
% ntree (optional): number of trees (default is 500)
% mtry (default is max(floor(D / 3), 1) D = number of features in X)

% Boolean aliases so the option handling below reads like the R package docs.
DEBUG_ON = 0;
DEFAULTS_ON = 0;
TRUE = 1;
FALSE = 0;
% Copy any user-supplied settings from the extra_options struct into local
% variables; anything the caller did not set receives a default further below.
% The commented-out fields (classwt, cutoff, strata, norm_votes) apply to
% classification only and are not used by this regression wrapper.
if exist('extra_options', 'var')
if isfield(extra_options, 'DEBUG_ON'); DEBUG_ON = extra_options.DEBUG_ON; end
if isfield(extra_options, 'replace'); replace = extra_options.replace; end
% if isfield(extra_options, 'classwt'); classwt = extra_options.classwt; end
% if isfield(extra_options, 'cutoff'); cutoff = extra_options.cutoff; end
% if isfield(extra_options, 'strata'); strata = extra_options.strata; end
if isfield(extra_options, 'sampsize'); sampsize = extra_options.sampsize; end
if isfield(extra_options, 'nodesize'); nodesize = extra_options.nodesize; end
if isfield(extra_options, 'importance'); importance = extra_options.importance; end
if isfield(extra_options, 'localImp'); localImp = extra_options.localImp; end
if isfield(extra_options, 'nPerm'); nPerm = extra_options.nPerm; end
if isfield(extra_options, 'proximity'); proximity = extra_options.proximity; end
if isfield(extra_options, 'oob_prox'); oob_prox = extra_options.oob_prox; end
% if isfield(extra_options, 'norm_votes'); norm_votes = extra_options.norm_votes; end
if isfield(extra_options, 'do_trace'); do_trace = extra_options.do_trace; end
if isfield(extra_options, 'corr_bias'); corr_bias = extra_options.corr_bias; end
if isfield(extra_options, 'keep_inbag'); keep_inbag = extra_options.keep_inbag; end
end
% set defaults if not already set
% NOTE(review): DEBUG_ON is assigned unconditionally above, so this exist()
% check can never be false; it is kept only for symmetry with the others.
if ~exist('DEBUG_ON', 'var'); DEBUG_ON=FALSE; end
if ~exist('replace', 'var'); replace = TRUE; end
% if ~exist('classwt', 'var'); classwt = []; end % will handle these three later
% if ~exist('cutoff', 'var'); cutoff = 1; end
% if ~exist('strata', 'var'); strata = 1; end
% Default sample size: every row when bootstrapping with replacement,
% otherwise 63.2% of the rows (mirrors R's randomForest default).
if ~exist('sampsize', 'var')
if (replace)
sampsize = size(p_train, 1);
else
sampsize = ceil(0.632 * size(p_train, 1));
end
end
if ~exist('nodesize', 'var'); nodesize = 5; end % classification = 1, regression = 5
if ~exist('importance', 'var'); importance = FALSE; end
if ~exist('localImp', 'var'); localImp = FALSE; end
if ~exist('nPerm', 'var'); nPerm = 1; end
% if ~exist('proximity', 'var'); proximity = 1; end % will handle these two later
% if ~exist('oob_prox', 'var'); oob_prox = 1; end
% NOTE(review): the file is truncated at this point (original line read
% "% if ~exi") -- the remaining default handling and the call into the
% mexRF_train routine are missing from this copy.
基于灰狼算法优化随机森林(GWO-RF)的时间序列预测,GWO-RF时间序列预测 模型评价指标包括:R2、MAE、MSE、RMSE、MAPE
需积分: 0 45 浏览量
更新于2023-09-21
收藏 53KB ZIP 举报
在时间序列预测领域,结合了灰狼算法(GWO)和随机森林(RF)的GWO-RF模型是一个高效的方法。灰狼算法是一种优化技术,灵感来源于灰狼社会的狩猎行为,它能够在全球搜索最优解,而随机森林则是一种集成学习方法,通过构建并结合多个决策树来提高预测准确性。以下将详细介绍这两个核心概念以及它们在时间序列预测中的应用。
让我们了解一下灰狼算法(GWO)。GWO是由四种主要操作组成的:追捕、探索、攻击和逃避。在解决优化问题时,算法模拟灰狼群的行为来寻找最佳解决方案,即“阿尔法”(α)、“贝塔”(β)和“德尔塔”(δ)狼的位置,它们分别代表最优解、次优解和第三优解。通过不断迭代更新,算法能够在复杂空间中找到全局最优解,这对于调整随机森林参数非常有用。
随机森林(RF)是基于决策树的集成学习模型。它通过构建多棵决策树并取其平均预测结果来减少过拟合风险。在RF中,每棵树都是独立生成的,使用不同的随机子集(bootstrap sample)训练数据和特征。预测时,RF会综合所有决策树的预测结果,通常采用多数投票或平均值策略。
在GWO-RF模型中,灰狼算法用于优化随机森林的关键参数,如树的数量、每个节点分裂的最大特征数等。通过GWO,可以寻找最优的参数组合,使得随机森林在训练数据上的表现最佳,从而提升预测效果。
模型评价指标对于评估预测性能至关重要。在描述中提到了R2(决定系数)、MAE(平均绝对误差)、MSE(均方误差)、RMSE(均方根误差)和MAPE(平均绝对百分比误差)。这些指标分别衡量了模型预测值与实际值之间的相关性、误差的大小以及相对于实际值的相对误差。高R2值表示模型解释了数据的大部分变异,低MAE、MSE和RMSE表明误差较小,而低MAPE则意味着模型的预测精度较高。
压缩包内的文件名揭示了实现GWO-RF模型的步骤:
1. `regRF_train.m`:这可能是训练随机森林模型的脚本。
2. `GWO.m`:灰狼算法的核心实现。
3. `main.m`:主程序,调用其他函数进行模型训练和预测。
4. `fun.m`:可能包含了模型评价函数。
5. `initialization.m`:初始化灰狼算法的参数。
6. `regRF_predict.m`:用于模型预测的函数。
7. `data_process.m`:数据预处理脚本,可能包括数据清洗、归一化等。
8. `mexRF_train.mexw64` 和 `mexRF_predict.mexw64`:这两个是编译后的可执行文件,可能是加速随机森林训练和预测的C/C++代码。
9. `windspeed.xls`:示例数据集,可能包含风速时间序列数据,用于模型训练和测试。
通过这样的模型和代码,研究者和开发者可以学习如何结合两种算法,以提高时间序列预测的准确性和效率。同时,由于代码质量高,用户可以方便地替换自己的数据进行预测分析。