# 第一题
读入数据,并提取出"2020/8/25-2020/8/28"所在的行:
```{R}
# 读取数据
table = read.table("E://2022数学建模//1_临界值.csv",header = T, sep = ',',encoding = 'UTF-8')
A_data = read.table("E://2022数学建模//1_B_ORIGINAL.csv", header =T, sep =",",encoding = 'UTF-8')
# 并提取出“2020/8/25-2020/8/28”所在的行
index = c(which(A_data[,1]=="2020/8/25"),
which(A_data[,1]=="2020/8/26"),
which(A_data[,1]=="2020/8/27"),
which(A_data[,1]=="2020/8/28"))
# 提取出“2020/8/25-2020/8/28”四日的污染物浓度
list(print(index),as.matrix(A_data[index,3:8]))
```
计算四日的IAQI
```{R}
### 计算四日的IAQI
IAQI=matrix(NA,4,6)
`colnames<-`(IAQI,c('SO2','NO2','PM10','PM2.5','O3','CO'))
for(zz in (1:4)){
for (jj in (1:6)) {
for ( ii in (2:8)){
if(table[(jj+1),(ii+1)]>A_data[(497+zz),(jj+2)] & A_data[(497+zz),(jj+2)]>table[(jj+1),ii]){
BP_hi=table[(jj+1),(ii+1)]
BP_lo=table[(jj+1),(ii)]
IAQI_hi=table[1,(ii+1)]
IAQI_lo=table[1,ii]
break
}else{
BP_hi=0
BP_lo=0
IAQI_hi=0
IAQI_lo=0
}
}
IAQI[zz,jj]=(IAQI_hi-IAQI_lo)/(BP_hi-BP_lo)*(A_data[(497+zz),(jj+2)]-BP_lo)+IAQI_lo
}
}
## 每行表示每一日的IAQI值
`colnames<-`(IAQI,c('SO2','NO2','PM10','PM2.5','O3','CO'))
days = c("2020/08/25","2020/08/26","2020/08/27","2020/08/28")
# 每一行取最大值
for (i in (1:4)) {
print(paste(days[i],"AQI",round(max(IAQI[i,]))))
}
```
# 第二题
## 2.1 数据处理
第二题,首先将附件一"监测点A逐小时污染物浓度与气象实测数据"中的"-"替换为"NA",再读入数据进行处理
```{R}
library(forecast)
library(zoo)
library(ggplot2)
#install.packages("hrbrthemes")
library(hrbrthemes)
library(ggplot2)
library(ggpubr)
#install.packages("VIM")
library(VIM)
library(mice)
### 第二题 ------
# 读实测数据, 填充空----
#install.packages("VIM")
library(VIM)
# 将附件一中, A点每小时实测数据中的"-"换为"NA"值, 方便插值
A_real <- read.csv("E://2022数学建模//A点每时测量.csv",
header = T, na.strings = 'NA', encoding = "UTF-8-BOM")
head(A_real)
table = read.table("E://2022数学建模//1_临界值.csv",header = T, sep = ',',encoding = 'UTF-8')
```
查看六种污染物的密度曲线及箱线图,判断离群值,并将离群值赋值为空值:
```{R}
a0 = ggplot(A_real,aes(x=SO2))+geom_density(colour="steelblue")
b0 = ggplot(A_real,aes(x=NO2))+geom_density(colour="red")
c0 = ggplot(A_real,aes(x=PM10))+geom_density(colour="green")
d0 = ggplot(A_real,aes(x=PM2.5))+geom_density(colour="black")
e0 = ggplot(A_real,aes(x=O3))+geom_density(colour="orange")
f0 = ggplot(A_real,aes(x=CO))+geom_density(colour="purple")
ggarrange(a0,b0,c0,d0,e0,f0,
ncol = 2, nrow = 3)
par(mfrow=c(2,3))
# 六个污染物的箱线图,并将离群值赋为空值
data_real = A_real[,-(2)]
for (i in (2:7)) {
outlier_values = boxplot(A_real[,(i+1)],main=colnames(A_real)[(i+1)])$out
AAA = A_real[,(i+1)]
AAA[which(AAA %in% outlier_values)]=NA
data_real[,i]=AAA
}
head(data_real)
```
将浓度小于0的值赋值为空值
```{R}
# 六个污染物的浓度小于0的值赋值为控制
for (i in (2:7)) {
zero = which(data_real[,i]<0)
AAA = data_real[,i]
AAA[zero]=NA
data_real[,i]=AAA
}
# 检验是否还有数据浓度小于0
which(data_real<0)
```
对缺失值进行可视化:
```{R}
# 缺失值总结
summary(data_real)
# 哪些行有缺失值
head(data_real[!complete.cases(data_real),])
# 缺失值可视化
aggr(data_real, prop = F, number = T)
# 缺失值总数
sum(is.na(data_real))
```
用多重插补法进行插补:
```{R}
## 填充缺失值
temp1 <-mice(data_real[,(-1)],m=6,method = "pmm",maxit = 30)
# 缺失值可视化
aggr(complete(temp1), prop = F, number = T)
# 插值的密度曲线
densityplot(temp1)
# 补全后的A点实测值
complete_A_real = complete(temp1)
head(complete_A_real)
```
对插值后的数据进行可视化处理
```{R}
# 补全后的离群值
par(mfrow=c(2,3))
boxplot(complete_A_real[,1], main="SO2")
boxplot(complete_A_real[,2], main="NO2")
boxplot(complete_A_real[,3], main="PM10")
boxplot(complete_A_real[,4], main="PM2.5")
boxplot(complete_A_real[,5], main="O3")
boxplot(complete_A_real[,6], main="CO")
# 一次绘制六种污染物浓度的分布密度曲线(插值后)
a1 = ggplot(complete_A_real,aes(x=SO2))+geom_density(colour="steelblue")
b1 = ggplot(complete_A_real,aes(x=NO2))+geom_density(colour="red")
c1 = ggplot(complete_A_real,aes(x=PM10))+geom_density(colour="green")
d1 = ggplot(complete_A_real,aes(x=PM2.5))+geom_density(colour="black")
e1 = ggplot(complete_A_real,aes(x=O3))+geom_density(colour="orange")
f1 = ggplot(complete_A_real,aes(x=CO))+geom_density(colour="purple")
ggarrange(a1,b1,c1,d1,e1,f1,
ncol = 2, nrow = 3)
# 插值后是否有缺失值检查
anyNA(complete_A_real)
# 写为 .csv 文件
# write.csv(complete_A_real,'A点每时实测(插值).csv')
```
下面计算各个污染物实测浓度的逐日均值:
```{R}
# 计算各浓度的逐日均值
a = cbind(A_real[,1],complete_A_real);
time=as.POSIXct(A_real[,1]) ;
head(time)
day=strftime(time,format = "%Y-%m-%d ");
a[,1]=day;
# 查看每个时间运行时间点的样本值,
head(table(day))
head(unique(day))
complete_A_real_day = as.data.frame(a)
complete_A_real_day = complete_A_real_day[-which(complete_A_real_day[,1]==as.Date("2021-07-13")),]
# 不同污染物每天的时间点的平均浓度
df_real = aggregate(x = complete_A_real_day[, -1], by =list(complete_A_real_day[,1]), FUN = mean);head(df_real)
anyNA(df_real)
# 最大8小时平均算法
max_8 <-function(x){
x = as.vector(x)
y = c()
if(length(x)>8){
for (i in 1:(length(x)-7)) {
y[i]=mean(x[i:(i+7)])
}}else{for (i in (1:length(x))) {
y[i]=x[i]
}}
return(max(y))
}
# 将臭氧的值替换为最大八小时平均
df_real[,6]=aggregate(x = complete_A_real_day[, 6], by = list(complete_A_real_day[,1]), FUN = max_8)[,-1];
names(df_real)= c("date","SO2","NO2","PM10","PM2.5","O3","CO","TEMPER","WET","Mbar","SPEED","DIRECTION")
anyNA(df_real)
head(df_real)
# 写出
# write.csv(df_real, "E://2022数学建模//A点各浓度实测逐日均值.csv")
```
## 2.2 聚类
```{R}
## 聚类 ----
#要是没有这个包的话,首先需要安装一下
#install.packages("factoextra")
#载入包
library(factoextra)
newdata <- read.csv("E://2022数学建模//A点每时实测(插值).csv",
header = T, na.strings = 'NA', encoding = "UTF-8-BOM")
head(newdata)
anyNA(newdata)
# 分别聚类----
# 数据进行标准化
par(mfrow=c(4,3))
for (i in 4:14) {
df <- scale(newdata[,i])
# 查看数据的前五行
head(df, n = 5)
#利用k-mean是进行聚类
km_result <- kmeans(df, 5, nstart =2000,iter.max=5000)
#提取类标签并且与原始数据进行合并
dd <- cbind(newdata, cluster = km_result$cluster);head(dd)
class_1 = dd[which(dd$cluster==1),]
class_2 = dd[which(dd$cluster==2),]
class_3 = dd[which(dd$cluster==3),]
class_4 = dd[which(dd$cluster==4),]
class_5 = dd[which(dd$cluster==5),]
plot(class_1[,i],class_1[,i],type="l",
ylim = c(min(dd[,i]),max(dd[,i])),
xlim=c(min(dd[,i]),max(dd[,i])),
lwd = 3,
xlab ="",
ylab = "",
main = paste(colnames(dd)[i]),
col = "blue")
lines(class_4[,i],class_4[,i],type="l",col="red",lwd = 3)
lines(class_3[,i],class_3[,i],type="l",col="green",lwd = 3)
lines(class_2[,i],class_2[,i],type="l",col="orange",lwd = 3)
lines(class_5[,i],class_5[,i],type="l",col="purple",lwd = 3)
print(paste("第",1,"类",colnames(dd)[i],"的浓度范围为:",min(class_1[i]),"至",max(class_1[i]),sep=''))
print(paste("第",2,"类",colnames(dd)[i],"的浓度范围为:",min(class_2[i]),"至",max(class_2[i]),sep=''))
print(paste("第",3,"类",colnames(dd)[i],"的浓度范围为:",min(class_3[i]),"至",max(class_3[i]),sep=''))
print(paste("第