/*****************************************************************
Copyright (C) 2001-2012 Leo Breiman, Adele Cutler and Merck & Co., Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
C driver for Breiman & Cutler's random forest code.
Re-written from the original main program in Fortran.
Andy Liaw Feb. 7, 2002.
Modifications to get the forest out Matt Wiener Feb. 26, 2002.
*****************************************************************/
#include <R.h>
#include <R_ext/Utils.h>
#include "rf.h"
/* Forward declaration of oob(); its definition is not in this part of the
   file.  NOTE(review): judging only by the name and parameter names, this
   presumably derives out-of-bag predictions/error rates (jest, errtr) from
   the vote counts (counttr) using the per-class cutoff -- confirm against
   the definition. */
void oob(int nsample, int nclass, int *jin, int *cl, int *jtr,int *jerr,
int *counttr, int *out, double *errtr, int *jest, double *cutoff);
/* Forward declaration of TestSetError(); its definition is not in this part
   of the file.  NOTE(review): judging only by the name and parameter names,
   this presumably turns test-set vote counts (countts) into predictions
   (jet) and, when labelts is set, test-set error rates (errts) -- confirm
   against the definition. */
void TestSetError(double *countts, int *jts, int *clts, int *jet, int ntest,
int nclass, int nvote, double *errts,
int labelts, int *nclts, double *cutoff);
/* Fortran-callable wrapper around R's uniform random number generator.
 *
 * F77_SUB(rrand) yields the symbol name under which Fortran code sees a
 * subroutine called `rrand`.  Fortran passes arguments by reference, hence
 * the single output pointer.
 *
 *   r: output; receives one draw from unif_rand().
 *
 * NOTE(review): R's API expects GetRNGstate()/PutRNGstate() to bracket any
 * use of unif_rand(); presumably the C driver that invokes the Fortran code
 * does so -- confirm.
 */
void F77_SUB(rrand)(double *r)
{
    const double draw = unif_rand();

    *r = draw;
}
void classRF(double *x, int *dimx, int *cl, int *ncl, int *cat, int *maxcat,
int *sampsize, int *strata, int *Options, int *ntree, int *nvar,
int *ipi, double *classwt, double *cut, int *nodesize,
int *outcl, int *counttr, double *prox,
double *imprt, double *impsd, double *impmat, int *nrnodes,
int *ndbigtree, int *nodestatus, int *bestvar, int *treemap,
int *nodeclass, double *xbestsplit, double *errtr,
int *testdat, double *xts, int *clts, int *nts, double *countts,
int *outclts, int *labelts, double *proxts, double *errts,
int *inbag) {
/******************************************************************
* C wrapper for random forests: get input from R and drive
* the Fortran routines.
*
* Input:
*
* x: matrix of predictors (transposed!)
* dimx: two integers: number of variables and number of cases
* cl: class labels of the data
* ncl: number of classes in the response
* cat: integer vector of number of classes in the predictor;
* 1=continuous
* maxcat: maximum of cat
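* Options: 10 integers, indexed from 0: (0=no, 1=yes unless noted)
* [0] add a second class (for unsupervised RF)?
* 1: sampling from product of marginals
* 2: sampling from product of uniforms
* [1] assess variable importance?
* [2] assess local (casewise) variable importance?
* [3] calculate proximity?
* [4] calculate proximity based on OOB predictions?
* [5] how often to print output? (0 is remapped to ntree + 1)
* [6] keep the forest for future prediction?
* [7] replace: sample with replacement?
* [8] stratify: use stratified sampling (see strata)?
* [9] keepInbag: keep the in-bag information (see inbag)?
* (An earlier revision of this list also had "calculate outlying
* measure?"; none of the ten entries read below corresponds to it.)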
* ntree: number of trees
* nvar: number of predictors to use for each split
* ipi: 0=use class proportion as prob.; 1=use supplied priors
* classwt: double vector of class priors (class weights)
* nodesize: minimum node size: no node with fewer than ndsize
* cases will be split
*
* Output:
*
* outcl: class predicted by RF
* counttr: matrix of votes (transposed!)
* imprt: matrix of variable importance measures
* impmat: matrix of local variable importance measures
* prox: matrix of proximity (if iprox=1)
******************************************************************/
int nsample0, mdim, nclass, addClass, mtry, ntest, nsample, ndsize,
mimp, nimp, near, nuse, noutall, nrightall, nrightimpall,
keepInbag, nstrata;
int jb, j, n, m, k, idxByNnode, idxByNsample, imp, localImp, iprox,
oobprox, keepf, replace, stratify, trace, *nright,
*nrightimp, *nout, *nclts, Ntree;
int *out, *bestsplitnext, *bestsplit, *nodepop, *jin, *nodex,
*nodexts, *nodestart, *ta, *ncase, *jerr, *varUsed,
*jtr, *classFreq, *idmove, *jvr,
*at, *a, *b, *mind, *nind, *jts, *oobpair;
int **strata_idx, *strata_size, last, ktmp, nEmpty, ntry;
double av=0.0, delta=0.0;
double *tgini, *tx, *wl, *classpop, *tclasscat, *tclasspop, *win,
*tp, *wr;
addClass = Options[0];
imp = Options[1];
localImp = Options[2];
iprox = Options[3];
oobprox = Options[4];
trace = Options[5];
keepf = Options[6];
replace = Options[7];
stratify = Options[8];
keepInbag = Options[9];
mdim = dimx[0];
nsample0 = dimx[1];
nclass = (*ncl==1) ? 2 : *ncl;
ndsize = *nodesize;
Ntree = *ntree;
mtry = *nvar;
ntest = *nts;
nsample = addClass ? (nsample0 + nsample0) : nsample0;
mimp = imp ? mdim : 1;
nimp = imp ? nsample : 1;
near = iprox ? nsample0 : 1;
if (trace == 0) trace = Ntree + 1;
tgini = (double *) S_alloc(mdim, sizeof(double));
wl = (double *) S_alloc(nclass, sizeof(double));
wr = (double *) S_alloc(nclass, sizeof(double));
classpop = (double *) S_alloc(nclass* *nrnodes, sizeof(double));
tclasscat = (double *) S_alloc(nclass*32, sizeof(double));
tclasspop = (double *) S_alloc(nclass, sizeof(double));
tx = (double *) S_alloc(nsample, sizeof(double));
win = (double *) S_alloc(nsample, sizeof(double));
tp = (double *) S_alloc(nsample, sizeof(double));
out = (int *) S_alloc(nsample, sizeof(int));
bestsplitnext = (int *) S_alloc(*nrnodes, sizeof(int));
bestsplit = (int *) S_alloc(*nrnodes, sizeof(int));
nodepop = (int *) S_alloc(*nrnodes, sizeof(int));
nodestart = (int *) S_alloc(*nrnodes, sizeof(int));
jin = (int *) S_alloc(nsample, sizeof(int));
nodex = (int *) S_alloc(nsample, sizeof(int));
nodexts = (int *) S_alloc(ntest, sizeof(int));
ta = (int *) S_alloc(nsample, sizeof(int));
ncase = (int *) S_alloc(nsample, sizeof(int));
jerr = (int *) S_alloc(nsample, sizeof(int));
varUsed = (int *) S_alloc(mdim, sizeof(int));
jtr = (int *) S_alloc(nsample, sizeof(int));
jvr = (int *) S_alloc(nsample, sizeof(int));
classFreq = (int *) S_alloc(nclass, sizeof(int));
jts = (int *) S_alloc(ntest, sizeof(int));
idmove = (int *) S_alloc(nsample, sizeof(int));
at = (int *) S_alloc(mdim*nsample, sizeof(int));
a = (int *) S_alloc(mdim*nsample, sizeof(int));
b = (int *) S_alloc(mdim*nsample, sizeof(int));
mind = (int *) S_alloc(mdim, sizeof(int));
nright = (int *) S_alloc(nclass, sizeof(int));
nrightimp = (int *) S_alloc(nclass, sizeof(int));
nout = (int *) S_alloc(nclass, sizeof(int));
if (oobprox) {
oobpair = (int *) S_alloc(near*near, sizeof(int));
}
/* Count number of cases in each class. */
zeroInt(classFreq, nclass);
for (n = 0; n < nsample; ++n) classFreq[cl[n] - 1] ++;
/* Normalize class weights. */
normClassWt(cl, nsample, nclass, *ipi, classwt, classFreq);
if (stratify) {
/* Count number of strata and frequency of each stratum. */
nstrata = 0;
for (n = 0; n < nsample0; ++n)
if (strata[n] > nstrata) nstrata = strata[n];
/* Create the array of pointers, each pointing to a vector
of indices of where data of each stratum is. */
strata_size = (int *) S_alloc(nstrata, sizeof(int));
for (n = 0; n < nsample0; ++n) {
strata_size[strata[n] - 1] ++;
}
strata_idx = (int **) S_alloc(nstrata, sizeof(int *));
for (n = 0; n < ns
- 1
- 2
前往页