import requests
from bs4 import BeautifulSoup
import bs4
import csv
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from pyecharts import Map
def getHTMLText(url):
    """Download *url* and return the response body as text.

    The response encoding is set to the encoding detected from the body
    (``apparent_encoding``) before the text is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the sentinel string '爬取失败'
        ("crawl failed") when the request fails: connection error,
        timeout (30 s) or an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel. A bare ``except``
        # would also hide KeyboardInterrupt and programming errors.
        return '爬取失败'
def _append_table_rows(ulist, html, ncols):
    """Append the text of the first *ncols* <td> cells of every <tbody> row to *ulist*."""
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "no <tbody> element found in html; the download may have failed"
        )
    for tr in tbody.children:
        # Children that are not tags (e.g. strings between rows) are skipped.
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            # Indexing (not slicing) keeps the original IndexError for rows
            # that have fewer than ncols cells.
            ulist.append([tds[i].text for i in range(ncols)])


def fillUnivlist(ulist, html):
    """Parse the first <tbody> in *html* and append 8-column rows to *ulist*.

    Args:
        ulist: List that receives one ``[text0, ..., text7]`` list per row.
        html: HTML document text.

    Raises:
        ValueError: If *html* contains no <tbody> element.
        IndexError: If a row has fewer than 8 <td> cells.
    """
    _append_table_rows(ulist, html, 8)


def fillUnivlist2(ulist, html):
    """Parse the first <tbody> in *html* and append 3-column rows to *ulist*.

    Args:
        ulist: List that receives one ``[text0, text1, text2]`` list per row.
        html: HTML document text.

    Raises:
        ValueError: If *html* contains no <tbody> element.
        IndexError: If a row has fewer than 3 <td> cells.
    """
    _append_table_rows(ulist, html, 3)
def writeUlistfile(ulist, dataname):
    """Write every row of *ulist* to the CSV file *dataname*.

    The file is opened for writing (existing content is replaced), encoded
    as UTF-8, with ``newline=''`` so the csv writer controls line endings.

    Args:
        ulist: Iterable of rows, each row an iterable of cell values.
        dataname: Path of the CSV file to create.
    """
    with open(dataname, 'w', encoding='utf-8', newline='') as csv_out:
        csv.writer(csv_out).writerows(ulist)
def histogramvertical(dataname, x, y, c, left, right, name, xname, yname):
    """Draw a vertical bar chart from a slice of the rows of a CSV file.

    Args:
        dataname: Path of the UTF-8 CSV file to read.
        x: Column used for the x axis (forwarded to ``DataFrame.plot.bar``).
        y: Column name, or list of column names, plotted as bars.
        c: Bar colour(s), forwarded as ``color``.
        left: Start of the row slice ``sheet[left:right]`` that is plotted.
        right: End (exclusive) of that row slice.
        name: Chart title.
        xname: Label of the x axis.
        yname: Label of the y axis.
    """
    rcParams['font.family'] = 'simhei'
    # Create the axes here and hand them to pandas. Without ``ax`` pandas
    # opens its own figure, so the 10x6 figure stayed empty and the chart
    # was drawn at the default size on a second one.
    _, ax = plt.subplots(figsize=(10, 6))
    sheet = pd.read_csv(dataname, encoding="utf-8")
    sheet[left:right].plot.bar(x=x, y=y, color=c, ax=ax)
    ax.set_title(name, fontsize=16, fontweight='bold')
    ax.set_xlabel(xname, fontweight='bold')
    ax.set_ylabel(yname, fontweight='bold')
    plt.show()
def histogramflat(x, y, name, xname, yname):
    """Draw a horizontal bar chart and show it.

    Args:
        x: Bar positions on the vertical axis (first argument of ``barh``).
        y: Bar widths (second argument of ``barh``).
        name: Chart title.
        xname: Label of the x axis.
        yname: Label of the y axis.
    """
    rcParams['font.family'] = 'simhei'
    plt.figure(figsize=(10, 6))
    axes = plt.gca()
    axes.set_title(name, fontsize=20)
    axes.set_xlabel(xname, fontsize=14)
    axes.set_ylabel(yname, fontsize=14)
    axes.barh(
        x,
        y,
        alpha=0.6,
        facecolor='blue',
        edgecolor='yellow',
        label='产量',
    )
    # loc=5 is matplotlib's numeric code for the 'right' legend position.
    axes.legend(loc=5)
    plt.show()
def datachange(listname):
    """Return a new list with every item of *listname* converted to float.

    Args:
        listname: Iterable of values accepted by ``float()`` (e.g. numeric
            strings).

    Returns:
        List of floats in the same order as the input.
    """
    return [float(item) for item in listname]
def linechart(x, y, name, xname, yname):
    """Draw a line chart with a value label above every point and show it.

    Args:
        x: X coordinates of the points.
        y: Numeric y values of the points.
        name: Chart title.
        xname: Label of the x axis.
        yname: Label of the y axis.
    """
    rcParams['font.family'] = 'simhei'
    axes = plt.gca()
    axes.plot(x, y)
    axes.set_title(name)
    axes.set_xlabel(xname)
    axes.set_ylabel(yname)
    for xpos, value in zip(x, y):
        # The label is anchored 15 data units above the point, rounded to
        # an integer.
        axes.text(
            xpos,
            value + 15,
            '%.0f' % value,
            ha='center',
            va='bottom',
            fontsize=12,
        )
    plt.show()
def piechartmake(targetlist, yearname, chartname):
    """Draw a pie chart of *targetlist*, labelled by *yearname*, and show it.

    Args:
        targetlist: Slice sizes; numbers or numeric strings.
        yearname: Labels, one per slice (used as the Series index).
        chartname: Chart title.
    """
    rcParams['font.family'] = 'simhei'
    # Convert explicitly so numeric strings are plotted as numbers.
    s = pd.Series(targetlist, index=yearname).astype(float)
    plt.axis('equal')
    plt.pie(
        s,
        # One zero offset per slice. The former fixed 7-element list made
        # pie() fail for any other number of slices.
        explode=[0] * len(s),
        labels=s.index,
        # matplotlib cycles through this list when there are more slices
        # than colours.
        colors=['r', 'g', 'b', 'c', "yellow", 'grey', 'violet'],
        autopct='%.2f%%',
        pctdistance=0.6,
        labeldistance=1.2,
        shadow=True,
        startangle=0,
        radius=1,
        frame=False,
    )
    plt.title(chartname)
    plt.show()
def readlist(filename, listname):
    """Return one column of a UTF-8 CSV file as a list of strings.

    The first row of the file is used as the header (``csv.DictReader``).

    Args:
        filename: Path of the CSV file.
        listname: Header name of the column to extract.

    Returns:
        The values of that column, one per data row, in file order.

    Raises:
        KeyError: If the file has data rows but no column named *listname*.
    """
    # newline='' lets the csv module do its own newline handling, so line
    # breaks inside quoted fields are returned unchanged.
    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        return [row[listname] for row in reader]
def readline(filename, num):
    """Return row number *num* (0-based) of a UTF-8 CSV file.

    Args:
        filename: Path of the CSV file.
        num: Zero-based index of the row to return; row 0 is the first line.

    Returns:
        The row as a list of strings.

    Raises:
        IndexError: If the file has no row with index *num*.
    """
    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
        for i, fields in enumerate(csv.reader(csvfile)):
            if i == num:
                # Stop as soon as the row is found instead of reading on.
                return fields
    # An out-of-range index used to fail on an unbound local variable.
    raise IndexError('row {} not found in {}'.format(num, filename))
def listsums(listname):
    """Return the sum of *listname* as a string with two decimal places.

    Args:
        listname: Iterable of values accepted by ``float()``.

    Returns:
        The total formatted with ``'{:.2f}'``, e.g. ``'3.50'``; ``'0.00'``
        for an empty input.
    """
    total = 0
    for value in listname:
        total += float(value)
    return '{:.2f}'.format(total)
def percentage(listname):
    """Return each item's share of the column total as formatted strings.

    Args:
        listname: Iterable of values accepted by ``float()``.

    Returns:
        List of strings, one per item, each ``value / total`` formatted
        with ``'{:.4}'`` (four significant digits, e.g. ``'0.25'``).
        An empty input gives an empty list.

    Raises:
        ZeroDivisionError: If the input is non-empty and its total is 0.
    """
    values = [float(item) for item in listname]
    # Use the unrounded total. The earlier total was rounded to two
    # decimals, which skewed the ratios and made small totals divide by 0.
    total = sum(values)
    return ['{:.4}'.format(value / total) for value in values]
def handle_saveexcel_csv(dataname, excelname, csvname):
    """Clean a CSV file and save the result as both Excel and CSV.

    Rows containing any missing value are dropped, then duplicate rows are
    dropped. Both outputs are written without the index column.

    Args:
        dataname: Path of the UTF-8 CSV file to read.
        excelname: Path of the Excel file to write.
        csvname: Path of the CSV file to write.
    """
    cleaned = (
        pd.read_csv(dataname, encoding='utf-8')
        .dropna(how='any')
        .drop_duplicates()
    )
    cleaned.to_excel(excelname, index=False)
    cleaned.to_csv(csvname, index=False)
def mapoperation(firstline, secondline, headline, subhead):
    """Plot paired (name, value) data on a pyecharts ``Map`` of China.

    Args:
        firstline: Region names.
        secondline: Values paired position-by-position with *firstline*
            (``zip`` stops at the shorter of the two).
        headline: Chart title.
        subhead: Chart subtitle.

    NOTE(review): ``render()`` is called without a path, so the output
    location is whatever pyecharts uses by default — confirm against the
    installed pyecharts version.
    """
    pairs = list(zip(firstline, secondline))
    chart = Map(
        headline,
        subhead,
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color='#404a59',
    )
    names, amounts = chart.cast(pairs)
    chart.add(
        "产量",
        names,
        amounts,
        visual_range=[0, 12000],
        maptype='china',
        is_visualmap=True,
        type="effectScatter",
        is_random=True,
        effect_scale=5,
        visual_text_color='#000',
    )
    chart.show_config()
    chart.render()
def findspecific(listname, limit=27):
    """Shorten place names to the short form used for the map.

    Names starting with '内' or '黑' keep their first three characters
    (presumably 内蒙古 and 黑龙江 — confirm against the data); every other
    name keeps its first two. Names shorter than that are returned whole
    instead of raising IndexError.

    Args:
        listname: Sequence of place-name strings.
        limit: Number of leading entries to process. Defaults to 27, the
            value that used to be hard-coded.

    Returns:
        List of shortened names, one per processed entry.
    """
    three_char_prefixes = ('内', '黑')
    return [
        name[:3] if name.startswith(three_char_prefixes) else name[:2]
        for name in listname[:limit]
    ]
# Download the per-oil-type table and save the raw rows.
url1 = 'https://www.chyxx.com/industry/202105/953391.html'
html1 = getHTMLText(url1)
uinfo1 = []
fillUnivlist(uinfo1, html1)
writeUlistfile(uinfo1, '各种油产量初.csv')

# Download the per-province table and save the raw rows.
url2 = 'https://www.chyxx.com/industry/202009/896629.html'
html2 = getHTMLText(url2)
uinfo2 = []
fillUnivlist2(uinfo2, html2)
writeUlistfile(uinfo2, '各省产量初.csv')

# Clean both raw files; the charts below read the cleaned CSV copies.
handle_saveexcel_csv('各省产量初.csv', '各省产量.xlsx', '各省产量.csv')
handle_saveexcel_csv('各种油产量初.csv', '各种油产量.xlsx', '各种油产量.csv')

# Per-province output: horizontal bar chart and map.
x = readlist('各省产量.csv', '省市')
y = datachange(readlist('各省产量.csv', '2019年原油加工量产量:万吨'))
histogramflat(x, y, '2019年全国各省份加工原油产量', '2019年原油加工量产量:万吨', '省市')
x_3 = findspecific(x)
mapoperation(x_3, y, '2019年全国各省份原油加工量', '单位:(万吨)')

# Grouped bar chart of all oil types, rows 0-6 of the cleaned file.
x_2 = '年份'
y_2 = ['石脑油', '润滑基础油', '柴油', '煤油', '汽油', '燃料油', '石油沥青']
color = ["blue", "green", "yellow", 'red', 'black', 'grey', 'violet']
histogramvertical('各种油产量.csv', x_2, y_2, color, 0, 7, '2014-2020年各种加工油产量', '年份', '产量')

# One line chart and one pie chart per oil type. Header columns 1-7 are
# the oil types; column 0 is the year.
oiltype = readline('各种油产量.csv', 0)
yearname = readlist('各种油产量.csv', '年份')
for oil in oiltype[1:8]:
    # Read the column once and reuse it for both charts.
    column = readlist('各种油产量.csv', oil)
    lt_y1 = datachange(column)
    lt1 = percentage(column)
    linechart(yearname, lt_y1, oil + '2014-2020各年加工产量', '年份', '产量')
    piechartmake(lt1, yearname, oil + '2014-2020各年加工产量占比')
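# 评论0 — NOTE(review): looks like leftover web-page text ("Comments 0"), not code; kept as a comment so it cannot raise NameError.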