# ---- Setup and first look at the data ----
library(ggplot2)
data("diamonds")
head(diamonds)
tail(diamonds)
# Output files are written relative to the working directory. setwd() stops
# the script with an error when the directory does not exist, so only switch
# to this machine-specific folder when it is actually present.
work_dir <- "C:/Users/User/Documents/R/R语言作业"
if (dir.exists(work_dir)) {
  setwd(work_dir)
}
summary(diamonds)
library(mice)
# Tabulate the missing-value pattern of every column.
md.pattern(diamonds)
# Logical index: TRUE for rows that repeat an earlier row.
m <- duplicated(diamonds)
# Copy of the data without the repeated rows.
# NOTE(review): the de-duplicated copy is stored in `data`, but every later
# step of this script keeps working on `diamonds`, so duplicates are still
# present downstream - confirm whether that is intended.
data <- diamonds[!m, ]
# ---- Remove price outliers ----
# Force price to a plain numeric vector before computing boxplot statistics.
diamonds$price <- as.numeric(as.character(diamonds$price))
boxplot(diamonds$price)
# Values that boxplot.stats() reports as lying beyond the whiskers.
price_outliers <- boxplot.stats(diamonds$price)$out
is_price_outlier <- diamonds$price %in% price_outliers
# Show the flagged rows.
diamonds[is_price_outlier, ]
# Drop the flagged rows. A logical mask is used instead of -which(...):
# when nothing is flagged, which() returns integer(0) and a negative empty
# index selects zero rows, which would silently discard the whole data set.
diamonds1 <- diamonds[!is_price_outlier, ]
# Save the cleaned data set.
write.csv(diamonds1, "diamonds1.csv")
# ---- Univariate exploration of the cleaned data (diamonds1) ----

# Draw a density-scaled histogram of `values`, then overlay a kernel density
# estimate (red) and a normal curve using the sample mean and standard
# deviation (blue).
#   values: numeric vector to plot
#   xlab:   x-axis label
#   main:   plot title
plot_hist_with_curves <- function(values, xlab, main) {
  hist(values, col = "grey", freq = FALSE, xlab = xlab, main = main)
  lines(density(values), col = "red", lwd = 2)
  x_grid <- seq(min(values), max(values), length.out = 100)
  y_norm <- dnorm(x_grid, mean = mean(values), sd = sd(values))
  lines(x_grid, y_norm, col = "blue", lwd = 2)
  invisible(NULL)
}

# Draw a pie chart of how often each level of `values` occurs. Slice labels
# are "<level> <rounded percent>%"; level names come from the table itself so
# labels can never get out of step with the counts.
#   values: categorical vector (factor) to tabulate
#   main:   plot title
plot_share_pie <- function(values, main) {
  counts <- table(values)
  slices <- as.numeric(counts)
  pct <- round(slices / sum(slices) * 100)
  pie(slices, labels = paste0(names(counts), " ", pct, "%"), main = main)
  invisible(NULL)
}

# -- price --
summary(diamonds1$price)
t1 <- as.data.frame(table(diamonds1$price))
plot(t1)
plot_hist_with_curves(diamonds1$price, xlab = "price", main = "砖石价格的直方图")
t2 <- table(diamonds1$price)
barplot(t2)
t3 <- as.data.frame(t2)
# Sort prices by how many diamonds share them, most frequent first.
t4 <- t3[order(-t3$Freq), ]
# Ten most frequent prices.
barplot(t4$Freq[1:10], names.arg = t4$Var1[1:10], main = "砖石数最多的前十个价格")

# -- carat --
summary(diamonds1$carat)
plot_hist_with_curves(diamonds1$carat, xlab = "carat", main = "钻石的克拉重量直方图")

# -- cut --
table(diamonds1$cut)
plot_share_pie(diamonds1$cut, main = "切割质量所占比例图")

# -- color --
table(diamonds1$color)
plot_share_pie(diamonds1$color, main = "切割颜色所占比例图")

# -- clarity --
table(diamonds1$clarity)
plot_share_pie(diamonds1$clarity, main = "钻石清晰度所占比例图")

# -- depth --
summary(diamonds1$depth)
plot_hist_with_curves(diamonds1$depth, xlab = "depth", main = "总深度百分比直方图")

# -- table --
summary(diamonds1$table)
plot_hist_with_curves(diamonds1$table, xlab = "table", main = "钻石顶部相对于最宽点的宽度直方图")

# -- x --
summary(diamonds1$x)
plot_hist_with_curves(diamonds1$x, xlab = "x", main = "钻石长度直方图")

# -- y --
summary(diamonds1$y)
plot_hist_with_curves(diamonds1$y, xlab = "y", main = "钻石宽度直方图")

# -- z --
summary(diamonds1$z)
plot_hist_with_curves(diamonds1$z, xlab = "z", main = "钻石高度直方图")
# ---- Correlation between the numeric columns ----
library(corrplot)
numeric_cols <- c("carat", "depth", "table", "price", "x", "y", "z")
subs <- diamonds1[, numeric_cols]
# Correlation matrix, printed and then reused for the plot.
cor_subs <- cor(subs)
cor_subs
# Visualise the coefficients: numbers in the upper triangle, variable names on
# the diagonal, colour legend at the bottom.
# NOTE(review): no font family called "myFont" is registered anywhere in this
# script - confirm it is defined in the session before running.
corrplot(
  cor_subs,
  type = "upper",
  method = "number",
  tl.pos = "d",
  cl.pos = "b",
  cl.cex = 1.5,
  tl.cex = 1.5,
  tl.col = "black",
  number.cex = 1.8,
  family = "myFont"
)
# ---- Price distribution by cut, and two-sample comparison ----
library(ggplot2)
# Overlaid, semi-transparent price densities, one per cut level.
ggplot(data = diamonds1, aes(x = price, fill = cut)) +
  geom_density(alpha = .3)
# Rows belonging to the two cut levels that are compared below.
very_good <- diamonds1[which(diamonds1$cut == "Very Good"), ]
Premium <- diamonds1[which(diamonds1$cut == "Premium"), ]
library(nortest)
# Anderson-Darling normality test on the standardised prices.
ad.test(scale(diamonds1$price))
# One-sided Wilcoxon rank-sum test; alternative = "less" refers to the first
# sample (Very Good) relative to the second (Premium).
wilcox.test(very_good$price, Premium$price, alternative = "less")
# ---- Linear model on all predictors, with influence diagnostics ----
fit <- lm(price ~ ., data = diamonds1)
summary(fit)
library(car)
# NOTE(review): id.method = "identify" requests interactive point labelling;
# newer car releases appear to take this through `id = list(method = ...)`
# instead - confirm against the installed car version.
influencePlot(
  fit,
  id.method = "identify",
  main = "Influence Plot",
  sub = "Circle size is proportional to Cook's distance"
)
outlierTest(fit)
# Row positions removed from diamonds1.
# NOTE(review): these positions are hard-coded - presumably copied from the
# diagnostics above; verify they still match if the input data changes.
influential_rows <- c(
  16284, 23645, 19340, 19347, 21863,
  22429, 21759, 19867, 17197, 23540
)
diamonds1 <- diamonds1[-influential_rows, ]
# `fit` was estimated before the rows above were dropped; it is not refitted
# here, so the variance inflation factors describe the original fit.
vif(fit)
# ---- Best-subset search and the reduced model ----
library(leaps)
# Keep the 8 best models of each size, then compare them by adjusted R^2.
leaps <- regsubsets(price ~ ., data = diamonds1, nbest = 8)
plot(leaps, scale = "adjr2")
# Refit without the three dimension columns x, y and z.
fit1 <- lm(price ~ . - x - y - z, data = diamonds1)
summary(fit1)
# Standard lm diagnostic plots on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(fit1)
# ---- Price against each numeric predictor, with a kernel smoother ----
# Columns are read from diamonds1 explicitly. The original attach()-ed the data
# frame and never detached it, which left a stale copy of every column on the
# search path for the rest of the session.
par(mfrow = c(2, 3))
for (predictor in c("carat", "depth", "table", "x", "y", "z")) {
  plot(diamonds1[[predictor]], diamonds1$price, xlab = predictor, ylab = "price")
  # Nadaraya-Watson kernel regression estimate (Gaussian kernel, bandwidth 1).
  lines(
    ksmooth(
      diamonds1[[predictor]], diamonds1$price,
      kernel = "normal", bandwidth = 1
    ),
    col = "red"
  )
}
dev.off()

# ---- Model with quadratic terms for depth and table ----
fit2 <- lm(price ~ . + I(depth^2) + I(table^2), data = diamonds1)
summary(fit2)
hist(diamonds1$price, main = "Histogram of price", xlab = "price")

# ---- Recode price into six ordered-by-value bands A..F ----
# Bands are left-closed: A < 2000, B [2000, 4000), C [4000, 6000),
# D [6000, 8000), E [8000, 10000), F >= 10000.
# cut() bins the numeric prices in one step. The original assigned letters into
# the price column one band at a time, which turned the column into character
# after the first assignment and only kept working because the comparisons read
# the attach()-ed numeric copy of price.
diamonds2 <- diamonds1
diamonds2$price <- cut(
  diamonds1$price,
  breaks = c(-Inf, 2000, 4000, 6000, 8000, 10000, Inf),
  labels = c("A", "B", "C", "D", "E", "F"),
  right = FALSE
)
# ---- Conditional inference tree on the banded price ----
# Seed the generator so the 70/30 split (and therefore the fitted tree and the
# confusion table) can be reproduced from run to run.
set.seed(123)
train <- sample(nrow(diamonds2), 0.7 * nrow(diamonds2))
diamonds2.train <- diamonds2[train, ]
diamonds2.validate <- diamonds2[-train, ]
library(party)
fit.ctree <- ctree(price ~ . + I(depth^2) + I(table^2), data = diamonds2.train)
ctree.pred <- predict(fit.ctree, diamonds2.validate, type = "response")
# Confusion table: rows are the predicted band, columns the actual band.
# The dimension names follow the argument order; the original labelled the
# predictions "Actual" and the true values "Predicted".
ctree.perf <- table(
  ctree.pred, diamonds2.validate$price,
  dnn = c("Predicted", "Actual")
)
ctree.perf
# ---- Clustering ----
# Keep only the numeric columns as clustering features.
cluster_cols <- c("carat", "depth", "table", "price", "x", "y", "z")
df <- diamonds1[, cluster_cols]
# Standardise every feature (centre and scale each column).
df1 <- scale(df)
# Fix the random seed so the clustering can be reproduced.
set.seed(123)
# k-means with 4 clusters.
km_result <- kmeans(df1, centers = 4)
# Store each row's cluster assignment as a factor column.
diamonds1$class <- as.factor(km_result$cluster)
# ---- Box plots of every numeric variable by cluster ----
library(dplyr)
library(ggplot2)
# One box plot per variable, grouped and filled by the k-means cluster stored
# in diamonds1$class. The seven plots differ only in the variable shown, so
# they are produced in a loop instead of seven copies of the same code.
for (feature in c("carat", "depth", "table", "price", "x", "y", "z")) {
  plot_data <- data.frame(
    class = diamonds1$class,
    value = diamonds1[[feature]]
  )
  cluster_boxplot <- ggplot(plot_data, aes(x = class, y = value, fill = class)) +
    geom_boxplot() +
    labs(x = "cluster", y = feature)
  # ggplot objects are only drawn automatically at top level; inside a loop
  # they must be printed explicitly.
  print(cluster_boxplot)
}
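# NOTE: the original file ended with pagination residue copied from a web page
# ("- 1" through "- 6" and "前往页", i.e. "go to page"). It is not R code and is
# kept here only as a comment so the script can be sourced without an error.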