library(ggplot2)
# ---- Load the data set and take a first look ----
# `diamonds` is loaded into the workspace with data().
data("diamonds")
head(diamonds)
tail(diamonds)

# Relative output paths used later are resolved against the working directory.
# The project folder below is a machine-specific absolute path, so switch to
# it only when it exists; otherwise stay in the current directory and warn
# instead of aborting the whole script with a setwd() error.
project_dir <- "C:/Users/User/Documents/R/R语言作业"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Working directory not changed: '", project_dir, "' does not exist; ",
    "relative output paths resolve against ", getwd(),
    call. = FALSE
  )
}

summary(diamonds)

# Inspect the missing-data pattern of the data set.
library(mice)
md.pattern(diamonds)
# ---- Duplicates and price outliers ----
# Flag rows that are exact duplicates of an earlier row.
m <- duplicated(diamonds)
# De-duplicated copy of the data.
# NOTE(review): `data` is not referenced by any later visible step; the
# analysis below continues with `diamonds`, duplicates included. Confirm
# whether the de-duplicated rows were meant to be used instead.
data <- diamonds[!m, ]

# Make sure price is a plain numeric vector before computing statistics.
diamonds$price <- as.numeric(as.character(diamonds$price))
boxplot(diamonds$price)

# Outliers are the values boxplot.stats() reports in `$out`. The membership
# mask is computed once and reused for both displaying and dropping them.
is_price_outlier <- diamonds$price %in% boxplot.stats(diamonds$price)$out
diamonds[is_price_outlier, ]  # show the outlying rows

# Drop the outliers with a logical mask. `-which(mask)` would return zero
# rows when nothing is flagged, because an empty negative index selects
# nothing rather than everything.
diamonds1 <- diamonds[!is_price_outlier, ]
write.csv(diamonds1, "diamonds1.csv")  # save the outlier-free data set
summary(diamonds1$price)
# ---- Distribution of price ----
# Frequency of every distinct price value, plotted as a data frame.
t1 <- as.data.frame(table(diamonds1$price))
plot(t1)

# Density-scaled histogram of price with a kernel density estimate (red) and
# a normal curve using the sample mean and standard deviation (blue).
# The title spells "diamond" as 钻石, matching the other chart titles.
hist(
  diamonds1$price,
  col = "grey", freq = FALSE, xlab = "price",
  main = "钻石价格的直方图"
)
lines(density(diamonds1$price), col = "red", lwd = 2)
# Evaluate the normal density on 100 evenly spaced points across the range.
xfit <- seq(min(diamonds1$price), max(diamonds1$price), length.out = 100)
yfit <- dnorm(xfit, mean = mean(diamonds1$price), sd = sd(diamonds1$price))
lines(xfit, yfit, col = "blue", lwd = 2)

# Bar chart of the count of diamonds at every price.
t2 <- table(diamonds1$price)
barplot(t2)

# The ten prices shared by the largest number of diamonds.
t3 <- as.data.frame(t2)
t4 <- t3[order(-t3$Freq), ]  # sort by count, descending
barplot(
  t4$Freq[1:10],
  names.arg = t4$Var1[1:10],
  main = "钻石数最多的前十个价格"
)
# ---- Distribution of carat ----
summary(diamonds1$carat)
# Density-scaled histogram with a kernel density estimate (red) and a normal
# curve using the sample mean and standard deviation (blue).
hist(
  diamonds1$carat,
  col = "grey", freq = FALSE, xlab = "carat",
  main = "钻石的克拉重量直方图"
)
lines(density(diamonds1$carat), col = "red", lwd = 2)
# `length.out` is spelled out rather than relying on partial matching.
xfit <- seq(min(diamonds1$carat), max(diamonds1$carat), length.out = 100)
yfit <- dnorm(xfit, mean = mean(diamonds1$carat), sd = sd(diamonds1$carat))
lines(xfit, yfit, col = "blue", lwd = 2)
# Draw a pie chart showing the share of each category in `values`.
#
# values: a factor (or any vector accepted by table()).
# title:  main title of the chart.
#
# Slice labels have the form "<category> <percent>%", with percentages rounded
# to whole numbers. Category names are taken from the table itself, so labels
# and counts cannot fall out of step with the factor's level order.
# Returns the table of counts invisibly.
plot_share_pie <- function(values, title) {
  counts <- table(values)
  slices <- as.numeric(counts)
  pct <- round(slices / sum(slices) * 100)
  pie(slices, labels = paste0(names(counts), " ", pct, "%"), main = title)
  invisible(counts)
}

# ---- Shares of the categorical variables ----
table(diamonds1$cut)
plot_share_pie(diamonds1$cut, "切割质量所占比例图")

table(diamonds1$color)
plot_share_pie(diamonds1$color, "切割颜色所占比例图")

table(diamonds1$clarity)
plot_share_pie(diamonds1$clarity, "钻石清晰度所占比例图")
# Draw a density-scaled histogram of `values` and overlay two curves:
# a kernel density estimate (red) and a normal density using the sample mean
# and standard deviation (blue), evaluated on 100 evenly spaced points between
# the minimum and maximum of `values`.
#
# values: numeric vector to plot.
# label:  x-axis label.
# title:  main title of the histogram.
plot_density_hist <- function(values, label, title) {
  hist(values, col = "grey", freq = FALSE, xlab = label, main = title)
  lines(density(values), col = "red", lwd = 2)
  grid_x <- seq(min(values), max(values), length.out = 100)
  grid_y <- dnorm(grid_x, mean = mean(values), sd = sd(values))
  lines(grid_x, grid_y, col = "blue", lwd = 2)
  invisible(NULL)
}

# ---- Distributions of the remaining numeric variables ----
# Each variable gets a printed summary followed by its histogram.
summary(diamonds1$depth)
plot_density_hist(diamonds1$depth, "depth", "总深度百分比直方图")

summary(diamonds1$table)
plot_density_hist(diamonds1$table, "table", "钻石顶部相对于最宽点的宽度直方图")

summary(diamonds1$x)
plot_density_hist(diamonds1$x, "x", "钻石长度直方图")

summary(diamonds1$y)
plot_density_hist(diamonds1$y, "y", "钻石宽度直方图")

summary(diamonds1$z)
plot_density_hist(diamonds1$z, "z", "钻石高度直方图")
library(corrplot)

# ---- Correlations between the numeric variables ----
# Keep only the seven numeric columns.
subs <- subset(
  diamonds1,
  select = c("carat", "depth", "table", "price", "x", "y", "z")
)

# Print the correlation matrix.
cor(subs)

# Visualise the same matrix: upper triangle only, coefficients shown as
# numbers, variable names on the diagonal, colour legend at the bottom.
# NOTE(review): the "myFont" family is not registered anywhere in this
# script; confirm it is set up elsewhere.
corrplot(
  cor(subs),
  type = "upper",
  method = "number",
  tl.pos = "d",
  cl.pos = "b",
  cl.cex = 1.5,
  tl.cex = 1.5,
  tl.col = "black",
  number.cex = 1.8,
  family = "myFont"
)
library(ggplot2)

# ---- Price by cut ----
# Overlaid, semi-transparent price densities, one per cut category.
ggplot(diamonds1, aes(x = price, fill = cut)) +
  geom_density(alpha = 0.3)

# Rows belonging to the two cut categories that are compared below.
very_good <- diamonds1[diamonds1$cut == "Very Good", ]
Premium <- diamonds1[diamonds1$cut == "Premium", ]

library(nortest)
# Anderson-Darling normality test on the standardised price.
ad.test(scale(diamonds1$price))
# One-sided Wilcoxon rank-sum test: "Very Good" prices against "Premium"
# prices, alternative "less".
wilcox.test(very_good$price, Premium$price, alternative = "less")
# ---- Multiple linear regression on all predictors ----
fit <- lm(price ~ ., data = diamonds1)
summary(fit)

library(car)
# Influence and outlier diagnostics for the full model.
influencePlot(
  fit,
  id.method = "identify",
  main = "Influence Plot",
  sub = "Circle size is proportional to Cook's distance"
)
outlierTest(fit)

# Drop the flagged observations by row position.
# NOTE(review): these positions are hard-coded, presumably read off the
# diagnostics above; confirm they still point at the intended rows whenever
# the input data or the earlier cleaning steps change.
diamonds1 <- diamonds1[
  -c(16284, 23645, 19340, 19347, 21863, 22429, 21759, 19867, 17197, 23540),
]

# Refit on the cleaned data before checking collinearity, so the variance
# inflation factors describe the same rows as every model fitted below
# rather than the model that still contained the removed observations.
fit <- lm(price ~ ., data = diamonds1)
vif(fit)

# Best-subset search, plotted by adjusted R^2.
library(leaps)
leaps <- regsubsets(price ~ ., data = diamonds1, nbest = 8)
plot(leaps, scale = "adjr2")

# Model without the three dimension variables, with the standard lm
# diagnostic plots on a 2 x 2 grid.
fit1 <- lm(price ~ . - x - y - z, data = diamonds1)
summary(fit1)
par(mfrow = c(2, 2))
plot(fit1)
# ---- Price against each numeric predictor ----
# One scatter plot per predictor on a 2 x 3 grid, each with a Gaussian kernel
# smoother (bandwidth 1) overlaid in red. Columns are read from diamonds1
# explicitly instead of through attach(), so nothing is left on the search
# path for later code to pick up by accident.
par(mfrow = c(2, 3))
for (predictor in c("carat", "depth", "table", "x", "y", "z")) {
  plot(
    diamonds1[[predictor]], diamonds1$price,
    xlab = predictor, ylab = "price"
  )
  lines(
    ksmooth(
      diamonds1[[predictor]], diamonds1$price,
      bandwidth = 1, kernel = "normal"
    ),
    col = "red"
  )
}
# Return to a single-panel layout. Resetting par() keeps the graphics device
# open; closing it with dev.off() discards the current device, and in a
# non-interactive run the next plot would reopen the default output file and
# overwrite everything drawn so far.
par(mfrow = c(1, 1))

# ---- Regression with quadratic terms for depth and table ----
fit2 <- lm(price ~ . + I(depth^2) + I(table^2), data = diamonds1)
summary(fit2)

# ---- Classification of price bands ----
with(diamonds1, hist(price))

# Bin price into six bands of width 2000, closed on the left:
#   A: < 2000, B: [2000, 4000), C: [4000, 6000), D: [6000, 8000),
#   E: [8000, 10000), F: >= 10000.
# cut() builds the factor in one step; assigning letters into the numeric
# column piece by piece turns it into character after the first assignment
# and only works while a separate numeric copy of price is available.
diamonds2 <- diamonds1
diamonds2$price <- cut(
  diamonds1$price,
  breaks = c(-Inf, 2000, 4000, 6000, 8000, 10000, Inf),
  labels = c("A", "B", "C", "D", "E", "F"),
  right = FALSE
)

# 70 % training / 30 % validation split. The seed makes the split, and hence
# the reported confusion table, reproducible between runs.
set.seed(123)
train <- sample(nrow(diamonds2), floor(0.7 * nrow(diamonds2)))
diamonds2.train <- diamonds2[train, ]
diamonds2.validate <- diamonds2[-train, ]

# Conditional inference tree on the training set.
library(party)
fit.ctree <- ctree(price ~ . + I(depth^2) + I(table^2), data = diamonds2.train)
ctree.pred <- predict(fit.ctree, diamonds2.validate, type = "response")

# Confusion table: rows are the predicted band, columns the actual band.
# The dimension names follow the argument order of table().
ctree.perf <- table(
  ctree.pred, diamonds2.validate$price,
  dnn = c("Predicted", "Actual")
)
ctree.perf
# ---- Clustering ----
# Use only the numeric columns for clustering.
df <- diamonds1[, c("carat", "depth", "table", "price", "x", "y", "z")]

# Standardise every column.
df1 <- scale(df)

# Fix the random seed so the clustering can be reproduced.
set.seed(123)

# k-means with four clusters.
km_result <- kmeans(df1, centers = 4)

# Store the cluster assignment on the data as a factor.
diamonds1$class <- as.factor(km_result$cluster)
library(dplyr)
library(ggplot2)
# ---- Numeric variables by cluster ----
# One box plot per numeric variable, grouped and filled by the cluster label
# stored in `class`.
ggplot(diamonds1, aes(x = class, y = carat, fill = class)) +
  geom_boxplot() +
  xlab("cluster") +
  ylab("carat")

ggplot(diamonds1, aes(x = class, y = depth, fill = class)) +
  geom_boxplot() +
  xlab("cluster") +
  ylab("depth")

ggplot(diamonds1, aes(x = class, y = table, fill = class)) +
  geom_boxplot() +
  xlab("cluster") +
  ylab("table")

ggplot(diamonds1, aes(x = class, y = price, fill = class)) +
  geom_boxplot() +
  xlab("cluster") +
  ylab("price")

ggplot(diamonds1, aes(x = class, y = x, fill = class)) +
  geom_boxplot() +
  xlab("cluster") +
  ylab("x")

ggplot(diamonds1, aes(x = class, y = y, fill = class)) +
  geom_boxplot() +
  xlab("cluster") +
  ylab("y")

ggplot(diamonds1, aes(x = class, y = z, fill = class)) +
  geom_boxplot() +
  xlab("cluster") +
  ylab("z")

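# NOTE: everything below is web-page residue from the site this script was
# downloaded from (uploader name, resource list, pagination). It is not part
# of the analysis and is kept only as comments so that the file parses as R.
# blackhawknoe
# - 粉丝: 6
# 最新资源
# - cad实训总结.docx
# - ACESS创建数据库(孙家保)教学内容.ppt
# - 旅游电子商务发展.doc
# - 基于视觉特征的Web信息抽取技术的研究与实现的开题报告.docx
# - 软件开发流程与规范.docx
# - 程序设计第2章计算机网络教学案例.ppt
# - 大数据时代下智慧物流系统体系构建研究.docx
# - 计算机网络习题解答.docx
# - 电子商务知识培训ppt课件(1).ppt
# - 微机原理与接口技术试题库(含复习资料).doc
# - 中小企业审计信息化问题及解决对策.docx
# - 《基于web的图书馆管理系统》答辩PPT.ppt
# - 《电子商务》课程标准教学案例.doc
# - 电力通信技术在智能电网中的应用(1).docx
# - 营销型网站建设必须把握的几个原则?.doc
# - 软件公司实习报告.docx
# 资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
# 点击此处反馈



# - 1
# - 2
# - 3
# - 4
# - 5
# - 6
# 前往页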