### 第五章 大数据的展示
# Switch every locale category of the session to "English".
Sys.setlocale("LC_ALL", "English")
library(ggplot2) # plotting package used by the ggplot()/qplot() calls below

##### ------------------------------------------------------------ #####
##### 5.2 Displaying single-variable data
##### ------------------------------------------------------------ #####
##### I. Plots for qualitative variables
##### ------------------------------------------------------------ #####
# (1) Simple bar chart
# Load the loan data. `skip = 1` discards the first line of the file, so the
# second line is the one read as the header row.
data <- read.csv(
  "D:/lending club/LoanStats3a.csv",
  stringsAsFactors = FALSE, header = TRUE, sep = ",", skip = 1
)
# Force the loan amount to numeric (non-numeric entries become NA).
data$loan_amnt <- as.numeric(data$loan_amnt)

# Keep only the first six rows so that each bar is one loan.
data1 <- head(data)
# ggplot2 version: stat = "identity" plots the y values as given, not counts.
ggplot(data1, aes(x = factor(id), y = loan_amnt)) +
  geom_bar(stat = "identity")
# Base-graphics version of the same chart; `names.arg` is spelled out in full
# instead of relying on partial argument matching.
barplot(data1$loan_amnt, names.arg = data1$id)
# (2) Frequency bar chart: geom_bar() counts the rows in each grade.
ggplot(data, aes(x = grade)) +
  geom_bar()

# (3) Clustered and stacked bar charts
# All three charts share the same data and mapping (grade on x, term as fill).
grade_by_term <- ggplot(data, aes(x = grade, fill = term))
# Default position: bars of each term are stacked.
grade_by_term + geom_bar()
# Side-by-side (clustered) bars.
grade_by_term + geom_bar(position = "dodge")
# Clustered bars with narrower bars and an explicit dodge width.
grade_by_term + geom_bar(position = position_dodge(0.9), width = 0.7)
# (4) Turning a bar chart into a pie chart
# One stacked bar drawn in polar coordinates (angle mapped to y) is a pie.
ggplot(data, aes(x = factor(1), fill = grade)) +
  geom_bar() +
  coord_polar(theta = "y")

# Frequency table of the grades as a data frame; data.frame(table(.)) names
# its columns `Var1` (the level) and `Freq` (the count).
a <- data.frame(table(data$grade))
# Use the full column name `Var1` rather than the partially matched `a$Var`.
pie(a$Freq, labels = a$Var1, radius = 1)
# pie3D() is provided by the plotrix package, which this script never attaches;
# calling it through the namespace makes the line work in a fresh session.
plotrix::pie3D(a$Freq, labels = a$Var1, radius = 1.5, explode = 0.1)
# (5) Cleveland dot plot
# Work on the first 10 rows of the loan data.
data3 <- data[1:10, ]
# A dot plot only needs geom_point(); reorder() sorts the ids on the x axis by
# their installment value.
ggplot(
  data3,
  aes(x = reorder(id, installment), y = installment)
) +
  geom_point(size = 5)
#####——————————————————————————————#####
##### 二、定量变量制图
#####——————————————————————————————#####
# (1) Frequency histogram
# Handle missing values first: pull out the three columns that are needed and
# drop every row that has an NA in any of them.
data4 <- data[, c("installment", "grade", "term")]
data5 <- na.omit(data4)

# Bin width: the observed range of installment split into 40 equal bins.
binsize <- (max(data5$installment) - min(data5$installment)) / 40

# `fill` colours the bars and `colour` colours their borders.
ggplot(data5, aes(x = installment)) +
  geom_histogram(binwidth = binsize, fill = "pink", colour = "blue")

# Density-scaled histogram overlaid with a normal curve (same mean and sd as
# the data) and a kernel density estimate.
# NOTE(review): `..density..` and `size` for lines are reported as deprecated by
# recent ggplot2 releases; kept as-is to match the ggplot2 version this
# script was written for.
installment_mean <- mean(data5$installment)
installment_sd <- sd(data5$installment)
ggplot(data5, aes(x = installment)) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = binsize, fill = "pink", colour = "blue"
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = installment_mean, sd = installment_sd),
    size = 1
  ) +
  geom_density(colour = "blue", size = 1)
# (2) Histogram with a categorical variable
# Shared base plot: installment on x, bars filled by grade.
p <- ggplot(data5, aes(x = installment, fill = grade))
# Overlay the per-grade histograms (no stacking) and make them translucent.
p + geom_histogram(position = "identity", alpha = 0.4)

# (3) Faceted plots, all built from the same histogram of `p`
installment_hist <- p + geom_histogram()
# One panel per grade, laid out in a single row.
installment_hist + facet_grid(. ~ grade)
# One panel per grade, wrapped into a grid.
installment_hist + facet_wrap(~grade)
# Same wrapped layout, but each panel gets its own axis scales.
installment_hist + facet_wrap(~grade, scales = "free")
#####——————————————————————————————#####
#####——————————————————————————————#####
##### 5.3 多变量数据的展示
#####——————————————————————————————#####
#####——————————————————————————————#####
#####——————————————————————————————#####
##### 一、二维变量的展示
#####——————————————————————————————#####
# (1) Drawing a basic scatter plot
# Load the World Bank key-indicator table.
keyindicators <- read.csv(
  "D:/World Bank Data/keyindicators.csv",
  header = TRUE, sep = ",", encoding = "UTF-8"
)
# Turn the income group into an ordered factor with an explicit level order.
# `ordered` is spelled out in full instead of the partially matched `order`.
keyindicators$Income.Group <- factor(
  keyindicators$Income.Group,
  levels = c(
    "High income: nonOECD",
    "High income: OECD",
    "Upper middle income",
    "Lower middle income",
    "Low income"
  ),
  ordered = TRUE
)
# Preview the columns used by the scatter plots.
head(keyindicators[, c("country", "aGNI", "life_exp_m", "life_exp_f")])
# Working copy: only the columns needed below, restricted to complete rows.
keyindicators1 <- subset(
  keyindicators,
  select = c(
    country, Income.Group, Region, population, density_pop., composition_pop,
    GNI, aGNI, life_exp_f, life_exp_m, literacy
  )
)
keyindicators1 <- keyindicators1[complete.cases(keyindicators1), ]
# 1. Base-graphics scatter plots of per-capita national income vs. life
#    expectancy, female and male side by side.
# par() returns the previous settings; keep them so the 1x2 layout can be
# undone afterwards instead of leaking into every later base-graphics plot.
old_par <- par(mfrow = c(1, 2))
plot(
  log(keyindicators1$aGNI), keyindicators1$life_exp_f,
  xlab = "国民人均收入", ylab = "女性预期寿命", pch = 2, cex = 0.6
)
plot(
  log(keyindicators1$aGNI), keyindicators1$life_exp_m,
  xlab = "国民人均收入", ylab = "男性预期寿命", pch = 3, cex = 0.6
)
par(old_par) # restore the single-panel layout

# 2. The same scatter plots drawn with ggplot2
# Quick-plot interface.
qplot(log(aGNI), life_exp_f, data = keyindicators1)
qplot(log(aGNI), life_exp_m, data = keyindicators1)
# Full ggplot() interface.
ggplot(keyindicators1, aes(x = log(aGNI), y = life_exp_f)) +
  geom_point()
ggplot(keyindicators1, aes(x = log(aGNI), y = life_exp_m)) +
  geom_point()
# (2) Refining and decorating the scatter plot
# 1. Adding trend lines
# Base layer: log per-capita income vs. female life expectancy.
p <- ggplot(keyindicators1, aes(x = log(aGNI), y = life_exp_f)) +
  geom_point()
# Linear fit with a confidence band (the level is left at its default).
p + stat_smooth(method = lm)
# Same fit with a 0.99 confidence band.
p + stat_smooth(method = lm, level = 0.99)
# Same fit without the confidence band.
p + stat_smooth(method = lm, se = FALSE)
# Local (loess) regression curve instead of a straight line.
p + stat_smooth(method = loess)

# 2. Adding marginal rugs to the same scatter plot
p + geom_rug()

# 3. Adding labels (first 15 rows only, to keep the labels readable)
p <- ggplot(keyindicators1[1:15, ], aes(x = log(aGNI), y = life_exp_f)) +
  geom_point()
# Label a single point: row 7, placed at its own coordinates and labelled with
# its country name.
labelled_row <- keyindicators1[7, ]
p + annotate(
  "text",
  x = log(labelled_row$aGNI),
  y = labelled_row$life_exp_f,
  label = labelled_row$country
)
# Label every point with its country name.
p + geom_text(aes(label = country), size = 3)
#####——————————————————————————————#####
##### 二、三维变量的展示
#####——————————————————————————————#####
# (1) 3D scatter plot
# Columns of interest, previewed for reference.
keyindicators2 <- keyindicators[, c("country", "population", "aGNI", "life_exp_f")]
head(keyindicators2)

library(scatterplot3d)
# Evaluate inside keyindicators1 so the columns can be referred to by name.
with(keyindicators1, {
  # Draw the 3D point cloud.
  cloud <- scatterplot3d(
    log(GNI), log(population), life_exp_f,
    highlight.3d = TRUE
  )
  # Linear model of female life expectancy on the two log-scaled predictors.
  trend_fit <- lm(life_exp_f ~ log(GNI) + log(population))
  # Add the fitted regression plane to the plot.
  cloud$plane3d(trend_fit, col = "blue")
})
# (2) Bubble chart: point size encodes population.
ggplot(
  keyindicators1,
  aes(x = log(aGNI), y = life_exp_f, size = population)
) +
  geom_point(shape = 21, colour = "black", fill = "lightblue") +
  # scale_size_area() maps the value to the area of the circle; max_size caps
  # the largest bubble.
  scale_size_area(max_size = 25)

# (3) Grouped and faceted display
# 1. Distinguish income groups by point shape or point colour.
# Point shape:
ggplot(
  keyindicators1,
  aes(x = log(aGNI), y = life_exp_f, shape = Income.Group)
) +
  geom_point(size = 1.5)
# Point colour:
ggplot(
  keyindicators1,
  aes(x = log(aGNI), y = life_exp_f, colour = Income.Group)
) +
  geom_point(size = 2.5)
# 2. Faceted display
# Base-graphics conditioning plots; note these use the full `keyindicators`
# table rather than the complete-case subset.
# Conditioned on income group:
coplot(
  life_exp_f ~ log(aGNI) | Income.Group,
  data = keyindicators, rows = 1, cex = 1.3, pch = 16
)
# Conditioned on income group and region together:
coplot(
  life_exp_f ~ log(aGNI) | Income.Group + Region,
  data = keyindicators, rows = 1, cex = 1, pch = 16
)

## Faceting with ggplot2
basic <- ggplot(keyindicators1, aes(x = log(aGNI), y = life_exp_f)) +
  geom_point()
# One row of panels per income group.
basic + facet_grid(Income.Group ~ .)
# One column of panels per region.
basic + facet_grid(. ~ Region)
# Income groups as rows, regions as columns.
basic + facet_grid(Income.Group ~ Region)
#####——————————————————————————————#####
##### 三、二维变量的密度图
#####�