【免费】北京PM2.5数据分析【期末大作业】【代码+文档】资源-CSDN文库

共3个文件

docx：2个

py：1个

数据分析

需积分: 0 82 浏览量 2024-05-12 10:09:39 上传评论收藏 583KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

PM2.5.zip （3个子文件）

PM2.5

bejingpm25.py 5KB

文档.docx 629KB

~$1706611108黄浩月.docx 162B

import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql  # noqa: F401  (the "mysql+pymysql://" URL in write_sql names this driver)
import seaborn as sn
from sqlalchemy import create_engine

# Author: 黄浩月 201706611108
# The script is long; consider commenting out sections and running it piece by piece.

# File-name patterns whose CSVs are read with header=3 instead of header=2.
# NOTE(review): presumably those exports carry one extra preamble line - confirm
# against the raw files.
_HEADER3_PATTERNS = (".*20170803", ".*20170201", ".*20160201.csv")


def get_pm25data():
    """Read every file in ``./PM25/`` and return them as one DataFrame.

    Each file is parsed with ``latin1`` encoding and columns
    0, 2, 3, 4, 5, 6, 7, 9, 10. Files whose name matches one of
    ``_HEADER3_PATTERNS`` use row 3 as the header row, all others row 2.

    Returns:
        The concatenation of all per-file frames (original row labels are
        kept), or an empty DataFrame when the directory holds no files.
    """
    path = "./PM25/"
    frames = []
    # sorted() makes the concatenation order deterministic; os.listdir
    # returns entries in arbitrary order.
    for file in sorted(os.listdir(path)):
        print(file)
        uses_header3 = any(re.match(p, file) for p in _HEADER3_PATTERNS)
        frames.append(
            pd.read_csv(
                path + file,
                header=3 if uses_header3 else 2,
                usecols=[0, 2, 3, 4, 5, 6, 7, 9, 10],
                encoding="latin1",
            )
        )
    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; a single concat is also
    # linear instead of re-copying the accumulated frame on every file.
    return pd.concat(frames)


def _load_value_frame():
    """Return the readings as a time-sorted frame with one ``value`` column.

    The index is ``Date (LST)`` parsed to datetimes (index name ``date``).
    The textual date column is dropped so that numeric aggregations
    (mean/median/resample) do not trip over a string column.
    """
    raw = get_pm25data()
    frame = raw.loc[:, ["Date (LST)", "Value"]].copy()
    frame.columns = ["date", "value"]
    frame.index = pd.to_datetime(frame["date"])
    # Sorting is required for the label slices ("2010-1-1":"2010-12-31")
    # used by the analysis; the files are not guaranteed to be in date order.
    return frame[["value"]].sort_index()


def write_csvxls():
    """Dump the combined data set to ``./output/out.csv`` and ``./output/out.xlsx``."""
    bjdata = get_pm25data()
    # Create the target directory up front instead of failing when it is absent.
    os.makedirs("./output", exist_ok=True)
    bjdata.to_csv("./output/out.csv")
    bjdata.to_excel("./output/out.xlsx")


def write_sql():
    """Append the combined data set to table ``bjdata`` in MySQL schema ``apple``."""
    bjdata = get_pm25data()
    print("开始写入数据库")
    # NOTE(review): the credentials are hard-coded in the URL; consider
    # reading them from configuration or the environment.
    engine = create_engine("mysql+pymysql://root:root@localhost:3306/apple?charset=utf8")
    bjdata.to_sql('bjdata', engine, schema='apple', if_exists='append', index=False, index_label=False)
    print("写入数据库成功")


def nothava2017():
    """Plot, per year, the number of days whose mean reading is at most 75.

    Non-positive readings are discarded before the daily means are taken.
    """
    bj = _load_value_frame()
    bj = bj[bj.value > 0]
    daily = bj.resample("1D").mean()
    healthy = daily[daily.value <= 75]
    healthy.groupby(healthy.index.year).size().plot(title="每年健康天数对比图")


def sp(str):
    """Return the year portion of a date string.

    A string containing ``-`` yields the text before the first ``-``;
    otherwise the string is split on ``/`` and the first four characters of
    the third field are returned.
    """
    # The parameter keeps its original name (it shadows the builtin) so that
    # existing keyword callers keep working.
    if "-" in str:
        return str.split("-")[0]
    return str.split("/")[2][:4]


def main():
    """Configure pandas so printed frames are not truncated."""
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_colwidth', 200)
    pd.set_option('display.expand_frame_repr', False)


def _run_analysis():
    """Run the exploratory PM2.5 analysis: print summaries and draw the plots."""
    plt.rcParams['font.sans-serif'] = ['SimHei']

    bj = _load_value_frame()

    # --- Missing values -------------------------------------------------
    # The readings must NOT be filtered before this report: dropping the
    # non-positive rows first made the count always 0 and the fill a no-op.
    invalid = ~(bj["value"] > 0)  # non-positive or NaN
    print("处理前数据总量为：{}".format(len(bj)))
    print("含有缺失值数量：\n{}".format(int(invalid.sum())))
    print(pd.crosstab(index=bj.index.year, columns=bj.index.month))
    # The fill value is the mean of the valid readings only, so the invalid
    # markers do not drag it down.
    avg = bj.loc[~invalid, "value"].mean()
    print("用平均值{}进行缺失值填充".format(avg))
    bj["value"] = bj["value"].mask(invalid, avg)
    print("处理后含有缺失值数量：\n{}".format(len(bj) - len(bj[bj.value > 0])))

    # --- Weekday comparison --------------------------------------------
    # Kept in its own variable: overwriting ``bj`` with the describe() table
    # destroyed the time series that every later step relies on.
    weekday_stats = bj.groupby(bj.index.weekday).describe()
    # describe() columns are count, mean, std, min, 25%, 50%, 75%, max;
    # positions 1, 3 and 7 select mean, min and max.
    weekday_stats = weekday_stats.iloc[:, [1, 3, 7]]
    weekday_stats.plot(kind="bar", title="周日至周一PM2.5数据对比图")

    # --- Year x month maximum -------------------------------------------
    print(bj.pivot_table(index=bj.index.year, columns=bj.index.month, values='value', aggfunc="max"))

    # --- Yearly statistics ----------------------------------------------
    by_year = bj.groupby(bj.index.year)
    by_year.median().plot(kind="bar", title="2007-2017年pm2.5中位数")
    by_year.max().plot(title="2007-2017年pm2.5最大值")
    by_year.mean().plot(title="2007-2017年pm2.5平均值")
    by_year.min().plot(title="2007-2017年pm2.5最小值")
    print("2007-2017年pm2.5中位数")
    print(by_year.median().value)
    print("2007-2017年pm2.5最大值")
    print(by_year.max().value)
    print("2007-2017年pm2.5平均值")
    print(by_year.mean().value)
    print("2007-2017年pm2.5最小值")
    print(by_year.min().value)

    # --- Monthly mean and per-year distribution -------------------------
    bj.groupby(bj.index.month).mean().plot(title="月度PM2.5数据平均值分析",)
    by_year.plot.hist(bins=50)
    # NOTE(review): the original followed this with
    #     bj01 = bj.groupby(bj.index.year).sy; print(bj01)
    # ``sy`` is not a GroupBy attribute (AttributeError) and the intended
    # aggregation cannot be recovered from the code, so it is left out.

    # --- Healthy days per year ------------------------------------------
    # Same definition as nothava2017(): days whose mean reading is <= 75.
    # This replaces an attempt that mapped sp() over a non-existent "date"
    # column of the raw frame and called pd.pivot without ``columns``.
    daily = bj.resample("1D").mean()
    healthy = daily[daily.value <= 75]
    print(healthy)
    print(len(healthy))
    healthy_per_year = healthy.groupby(healthy.index.year).size()
    plt.figure()  # fresh figure so the Series plot does not land on the previous axes
    healthy_per_year.plot(title="每年健康天数对比图")
    print(healthy_per_year)
    healthy.groupby(healthy.index.year).plot.hist()

    # --- 2010 distribution ----------------------------------------------
    # Sliced from the datetime-indexed frame (the raw frame has no date
    # index); the title previously said 2020 although the slice is 2010.
    bj10 = bj.loc["2010-1-1":"2010-12-31"]
    plt.figure()
    bj10.value.plot.hist(title="2010年PM2.5浓度分布直方图", bins=20)

    # --- First half of 2017: daily means and linear fit -----------------
    bj17 = bj.loc["2017-1-1":"2017-6-30"]
    daily17 = bj17.resample("1D").mean()
    print(len(daily17.value.tolist()))
    print(len(pd.date_range("2017/1/1", "2017/6/30", freq="1D")))
    # Plot against the frame's own dates so x and y always have equal length
    # (a fixed date_range only matches when every day is present).
    plt.figure()
    plt.scatter(daily17.index, daily17.value.tolist())
    plt.plot(daily17.index, daily17.value.tolist())
    # Day ordinal 1..n as the regressor for the linear fit.
    daily17['x'] = np.arange(1, len(daily17) + 1)
    print(daily17)
    sn.lmplot(x="x", y="value", data=daily17)
    plt.title("线性拟合2017年上半年数据")
    plt.show()


if __name__ == '__main__':
    main()
    # write_sql()
    # nothava2017()
    _run_analysis()

评论收藏

内容反馈