# Random forests
# 1. Usage of the importance() function
# Install randomForest only when it is missing, so the script does not
# re-download the package on every run.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)
# mtcars: 1973-74 car models #
set.seed(5)
data(mtcars)
a <- mtcars
head(a)
str(a)
dim(a)
names(a)
# Fit a random forest regression model on the data set
model <- randomForest(mpg ~ ., data = a, ntree = 1000, importance = TRUE)
summary(model)
# With no type= argument, both importance measures are reported
importance(model)
# type = 1: report only the permutation-based measure (%IncMSE)
importance(model, type = 1)
# 2. The MDSplot() function
set.seed(5)
data(iris)
b <- iris
model2 <- randomForest(Species ~ ., iris, proximity = TRUE)
MDSplot(model2, b$Species, palette = rep(1, 3),
        pch = as.numeric(b$Species), k = 4)
# Try other values of k; the default is k = 2
# 3. The rfImpute() function, again on the iris data set
b.na <- b
b.na[75, 2] <- NA
b.na[125, 3] <- NA  # plant NAs in rows 75 and 125; [,2] / [,3] pick the variable
set.seed(121)
b.in <- rfImpute(Species ~ ., data = b.na)  # proximity-based imputation
list(
  "real"     = b[c(75, 125), 1:4],
  "have-NA"  = b.na[c(75, 125), 1:4],
  "disposed" = round(b.in[c(75, 125), 2:5], 1)  # rfImpute puts the response in column 1
)
# compare the imputed values against the originals #
# 4. treesize(): terminal-node counts of the individual trees
data(iris)
iris.rf <- randomForest(Species ~ ., iris)
summary(iris.rf)
hist(treesize(iris.rf))
# Visualization: plot the error against the number of trees in the forest
data(airquality)
set.seed(45)
str(airquality)
dim(airquality)
model3 <- randomForest(Ozone ~ ., data = airquality, mtry = 3,
                       importance = TRUE,
                       na.action = na.omit)  # mtry: variables tried at each split
plot(model3)
# Worked example: wine classification #
wine <- read.csv("F:\\学习文件及课件\\R语言相关\\数据\\神经网络酒的分类.csv")
str(wine)
names(wine) <- c(
  "fixedacidity", "volatileacidity", "citricacid", "residualsugar",
  "chlorides", "freesulfurdioxide", "totalsulfurdioxide", "density",
  "pH", "sulphates", "alcohol", "quality"
)
summary(wine)
dim(wine)
# Data preparation: recode the numeric quality score into three classes.
# Vectorized replacement for the original row-by-row loop: no hard-coded
# row count (1:4898) and no vector grown one element at a time.
# score > 6 -> "good", score > 5 -> "mid", otherwise -> "bad"
cha <- ifelse(wine[, 12] > 6, "good",
              ifelse(wine[, 12] > 5, "mid", "bad"))
# factor() sorts the levels alphabetically (bad, good, mid), exactly as the
# original loop-built factor did.
wine[, 12] <- factor(cha)
summary(wine$quality)
# Model building, approach 1: formula interface with a training subset
set.seed(101)
train <- sample(1:4898, 3000)
set.seed(111)
model5 <- randomForest(quality ~ ., data = wine, importance = TRUE,
                       proximity = TRUE, ntree = 500, subset = train)
# Model building, approach 2: separate predictor frame and response vector
x <- subset(wine, select = -quality)
y <- wine$quality
set.seed(78)
xr <- x[train, ]
yr <- y[train]
set.seed(111)
model5.1 <- randomForest(xr, yr, importance = TRUE, proximity = TRUE,
                         ntree = 500)
print(model5)
# Importance of the individual predictors
importance(model5)
# Model tuning: search over mtry (variables tried at each split)
n <- ncol(wine) - 1  # number of predictor variables in the data
rate <- numeric(n)   # preallocate the misclassification-rate vector
for (i in seq_len(n)) {
  set.seed(222)
  model6 <- randomForest(quality ~ ., data = wine, mtry = i,
                         importance = TRUE, ntree = 1000)
  # Mean OOB misclassification rate. The original averaged the entire
  # err.rate matrix (the OOB column plus the per-class columns), which does
  # not match the stated intent; restrict to the "OOB" column.
  rate[i] <- mean(model6$err.rate[, "OOB"])
}
rate
min(rate)
which(min(rate) == rate, arr.ind = TRUE)  # position of the smallest error rate
# Visualization of the tuned model
set.seed(222)
# Fix: the original passed "ntyee=1000" -- a typo for ntree=1000 that was
# silently swallowed by the ... argument, so the forest was actually grown
# with the default of 500 trees.
model6.1 <- randomForest(quality ~ ., data = wine, mtry = 2,
                         importance = TRUE, ntree = 1000)
plot(model6.1, col = 1:1)
legend(450, 0.215, "mid", cex = 0.9, bty = "n")  # add a legend per curve
legend(450, 0.28, "bad", cex = 0.9, bty = "n")
legend(450, 0.37, "good", cex = 0.9, bty = "n")
legend(450, 0.245, "total", cex = 0.9, bty = "n")
# Determine the final (optimal) model
set.seed(222)
# Fix: "ntyee=400" was a typo for ntree=400; the misspelled argument was
# absorbed by ... and the default of 500 trees was used instead.
model6.2 <- randomForest(quality ~ ., data = wine, mtry = 2,
                         importance = TRUE, ntree = 400,
                         proximity = TRUE)
print(model6.2)
hist(treesize(model6.2))  # distribution of terminal-node counts per tree
max(importance(model6.2))
max(treesize(model6.2))
min(treesize(model6.2))
MDSplot(model6.2, wine$quality, palette = rep(1, 3),
        pch = as.numeric(wine$quality))
# end of script