# 1. Data preprocessing ----
# Load the data set from the working directory (read once; the original
# script read the same file twice).
skillcraft <- read.csv("SkillCraft1_Dataset.csv")
head(skillcraft)
skillcraft <- skillcraft[-1]  # drop the first column
str(skillcraft)  # inspect the column types

# Convert TotalHours, HoursPerWeek and Age to numeric.
# Going through as.character() works whether read.csv() produced factors
# (the default before R 4.0) or character vectors (the default since R 4.0).
# The previous as.numeric(levels(x))[x] idiom is only valid for factors: on a
# character column levels() is NULL, every value turns into NA, and the
# complete.cases() filter below would then discard every row.
# Entries that cannot be parsed as numbers become NA (with a warning).
for (column_name in c("TotalHours", "HoursPerWeek", "Age")) {
  skillcraft[[column_name]] <- as.numeric(as.character(skillcraft[[column_name]]))
}

# Drop every row that contains a missing value.
skillcraft <- skillcraft[complete.cases(skillcraft), ]
# 2. Split the data into a training set and a test set ----
# Install caret only when it is missing, so that re-running the script does
# not reinstall the package (and hit the network) every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)

# Fixed seed so the partition is reproducible.
set.seed(1234)
# Row indices of the training set: an 80% sample drawn by
# createDataPartition() on LeagueIndex, returned as a matrix (list = FALSE).
skillcraft_sampling_vector <- createDataPartition(
  skillcraft$LeagueIndex,
  p = 0.8,
  list = FALSE
)
skillcraft_train <- skillcraft[skillcraft_sampling_vector, ]
skillcraft_test <- skillcraft[-skillcraft_sampling_vector, ]

# Basic information about the training and test sets.
dim(skillcraft_train)
dim(skillcraft_test)
table(skillcraft_train$LeagueIndex)
table(skillcraft_test$LeagueIndex)
# 3. Bagging: predicting LeagueIndex with bagged classification trees ----
# Install the required packages only when they are missing, instead of
# reinstalling them on every run.
for (pkg in c("adabag", "rpart")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(adabag)
library(rpart)

# The response must be a factor so that classification trees are fitted.
skillcraft_train$LeagueIndex <- as.factor(skillcraft_train$LeagueIndex)

# Fit 100 bagged trees.
# NOTE(review): `coob` is an argument of ipred::bagging(); it is not among the
# documented arguments of adabag::bagging() -- confirm it has any effect here.
baggedtree <- bagging(
  LeagueIndex ~ .,
  data = skillcraft_train,
  mfinal = 100,
  coob = TRUE
)
names(baggedtree)
baggedtree$importance
barplot(baggedtree$importance)

# Predict on the held-out test set and inspect the pieces of the result.
baggedtree.pred <- predict(baggedtree, skillcraft_test)
names(baggedtree.pred)
baggedtree.pred$formula
baggedtree.pred$votes[100:111, ]
baggedtree.pred$prob[100:111, ]
baggedtree.pred$class[100:111]
baggedtree.pred$confusion
baggedtree.pred$error

# Test-set misclassification rate. Labels are compared as character strings
# so the result is correct whether either side is stored as a factor, a
# character vector or a number (as.numeric() on a factor would compare
# internal level codes instead of the labels).
error.bagging <- mean(
  as.character(baggedtree.pred$class) != as.character(skillcraft_test$LeagueIndex)
)
error.bagging
# Boosting: predicting LeagueIndex with boosted classification trees ----
library(adabag)

# Fit 100 boosted trees with minsplit = 1.
# The original call passed `control = (minsplit = 1)`, which assigns a global
# variable `minsplit` and hands the bare number 1 to `control`, so the
# intended setting was never applied. The tree controls have to be built with
# rpart.control(); note that this makes the fitted model differ from what the
# malformed call produced.
boost <- boosting(
  LeagueIndex ~ .,
  data = skillcraft_train,
  control = rpart::rpart.control(minsplit = 1),
  mfinal = 100
)

# Predict on the held-out test set.
boost.pred <- predict(boost, skillcraft_test)
boost.pred$error
names(boost.pred)

# Compare the votes of the boosted and the bagged model on the first rows.
boost.pred$votes[1:10, ]
baggedtree.pred$votes[1:10, ]
boost.pred$class[1:10]

# Confusion matrix.
boost.pred$confusion

# Test-set misclassification rate; labels are compared as character strings
# so that factor level codes can never be mistaken for the labels.
error.boost <- mean(
  as.character(boost.pred$class) != as.character(skillcraft_test$LeagueIndex)
)
error.boost
# Random forest ----
# Install randomForest only when it is missing.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

set.seed(1234)
# na.action = na.roughfix replaces missing numeric values with the column
# median and missing categorical values with the column mode (picking one at
# random when there are several modes). The original call misspelled the
# argument as `na.raction`, so it was silently ignored.
# importance = TRUE makes the variable importances available; they are
# printed with importance() below.
fit.forest <- randomForest(
  LeagueIndex ~ .,
  data = skillcraft_train,
  na.action = na.roughfix,
  importance = TRUE
)
fit.forest

# type = 2 reports the relative importance of a variable as the total
# decrease in node impurity (heterogeneity) from splitting on it, averaged
# over all trees; impurity is measured by the Gini index.
importance(fit.forest, type = 2)
varImpPlot(fit.forest)

# Predict on the test set and tabulate actual against predicted classes.
forest.pred <- predict(fit.forest, skillcraft_test)
forest.perf <- table(
  skillcraft_test$LeagueIndex,
  forest.pred,
  dnn = c("Actual", "Predicted")
)
forest.perf

# Test-set misclassification rate. forest.pred is a factor, so the labels are
# compared as character strings; as.numeric() on a factor returns internal
# level codes, which only coincide with the labels when the levels happen to
# be exactly 1, 2, 3, ...
error.forest <- mean(
  as.character(forest.pred) != as.character(skillcraft_test$LeagueIndex)
)
error.forest
# Tune mtry: fit one forest per candidate value and record its error ----
# Number of predictors (every column except the response).
n <- ncol(skillcraft_train) - 1
# Preallocate the result vector instead of growing it inside the loop.
rate <- numeric(n)
for (i in seq_len(n)) {
  # Same seed for every candidate so the forests differ only in mtry.
  set.seed(123)
  model <- randomForest(
    LeagueIndex ~ .,
    data = skillcraft_train,
    importance = TRUE,
    mtry = i,
    ntree = 1000
  )
  # Mean over every entry of err.rate. Per the randomForest documentation
  # this matrix holds the OOB and the per-class error for each number of
  # trees, so the figure averages across all of them.
  rate[i] <- mean(model$err.rate)
}
rate

# NOTE(review): mtry = 9 is hard-coded below; presumably it was chosen by
# inspecting `rate` -- confirm it still matches the minimum.
set.seed(123)
model <- randomForest(
  LeagueIndex ~ .,
  data = skillcraft_train,
  mtry = 9,
  importance = TRUE,
  ntree = 1000
)
plot(model)  # model error against the number of trees

# Refit with 500 trees and evaluate on the test set.
set.seed(123)
model <- randomForest(
  LeagueIndex ~ .,
  data = skillcraft_train,
  mtry = 9,
  importance = TRUE,
  ntree = 500
)
model
forest.pred <- predict(model, skillcraft_test)
forest.perf <- table(
  skillcraft_test$LeagueIndex,
  forest.pred,
  dnn = c("Actual", "Predicted")
)
forest.perf

# Test-set misclassification rate; labels are compared as character strings
# because as.numeric() on the factor forest.pred would yield level codes.
error.forest <- mean(
  as.character(forest.pred) != as.character(skillcraft_test$LeagueIndex)
)
error.forest