import re
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import operator
import matplotlib.pylab as plt
import matplotlib
import matplotlib.ticker as ticker
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from xpinyin import Pinyin
#### 先获取城市ID
def City_code(city_name):
    """Look up the numeric area code for *city_name* in the local code table.

    Reads "城市代码.txt", locates the first occurrence of the city name and
    parses the 5-digit code stored in the 8 characters immediately preceding
    it (the code occupies the first 5 of those 8 characters).

    Args:
        city_name: Chinese city name exactly as it appears in the file.

    Returns:
        int: the area code (also printed, preserving the original behavior).

    Raises:
        ValueError: if the city name is not in the file, or the preceding
            characters do not parse as an integer.
    """
    # Context manager guarantees the handle is closed even if read() raises
    # (the original open/read/close leaked the handle on error).
    with open("城市代码.txt") as f:
        source = f.read()
    pos = source.find(city_name)
    if pos == -1:
        # Originally a miss (pos == -1) silently produced a garbage slice;
        # fail loudly so the caller knows the lookup table lacks this city.
        raise ValueError("city not found in 城市代码.txt: " + city_name)
    # The code sits at offsets [pos-8, pos-4) relative to the city name.
    code = int(source[pos - 8:pos][0:5])
    print(code)
    return code
#### 获取城市昨天与今天的气温
def WENDU(code, city_name):
    """Print yesterday's and today's temperature summary for one city.

    Scrapes tianqi.2345.com twice — the "1d" page for yesterday and the
    "2d" page for today — and prints the text of the first
    <span class="other-yes fl"> element found on each page.

    Args:
        code: numeric area id used in the site URL (see City_code).
        city_name: Chinese city name; converted to pinyin for the URL path.

    Returns:
        None -- this function only prints.
    """
    p = Pinyin()
    # The URL path wants bare pinyin, e.g. "beijing", not "bei-jing".
    pinyin = p.get_pinyin(city_name).replace("-", "")
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0'}

    def _other_yes_spans(url):
        # Fetch one page and return every <span class="other-yes fl"> node.
        # timeout added so a stalled server cannot hang the scraper forever
        # (the original requests.get had no timeout at all).
        html = requests.get(url, headers=headers, timeout=30).text
        return BeautifulSoup(html, 'lxml')('span', attrs={"class": "other-yes fl"})

    # Yesterday: the "1d" history page.
    wendu1 = _other_yes_spans("https://tianqi.2345.com/{0}1d/{1}.htm".format(pinyin, code))
    print("-------------------------------------")
    print(city_name + "的" + wendu1[0].text)
    # Today: the "2d" page; span text presumably looks like "…今天:NN°",
    # so splitting on the ASCII colon puts the temperature in index 1.
    wendu2 = _other_yes_spans("https://tianqi.2345.com/{0}2d/{1}.htm".format(pinyin, code))
    wendu2 = wendu2[0].text.split(":")
    print("-------------------------------------")
    print(city_name + "的" + "今天:" + str(wendu2[1]))
#### 爬取城市的天气数据,并且保存在(全国城市进行任意分析中去)
def City_data_2011_2015(name, code):
    """Scrape daily weather for 2011-2015 and save it as one Excel file.

    Hits the tianqi.2345.com history JSON endpoint once per month
    (60 requests), extracts per-day fields from the embedded HTML table
    with regexes, and writes
    '全国城市进行任意分析/<name>2011_2015_data_count.xlsx'.

    Args:
        name: city name, used in the progress message and output filename.
        code: numeric area id for the history endpoint (see City_code).

    Returns:
        None -- output goes to the Excel file and stdout.
    """
    import os  # local import: only needed here, keeps the file's top unchanged

    print("正在爬取" + name + "2011年到2015年的天气数据:")
    ## 正则表达式来匹配数据
    # Non-greedy <td>(.*?)</td> matches the generic cells; per page the
    # generic cells repeat as [date+week, weather, wind, ...] in groups of 3.
    re1 = re.compile('<td>(.*?)</td>')
    # Styled cells (colors taken from the site's markup): daily high / low.
    re2 = re.compile('<td style="color:#ff5040;">(.*?)</td>')
    re3 = re.compile('<td style="color:#3097fd;" >(.*?)</td>')

    ### 网站的url -- one endpoint URL per month, 2011-01 .. 2015-12.
    urls = [
        'http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D={0}'
        '&areaInfo%5BareaType%5D=2&date%5Byear%5D={1}&date%5Bmonth%5D={2}'.format(code, year, month)
        for year in range(2011, 2016)
        for month in range(1, 13)
    ]

    ### 网站的头
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0'}
    datess = []
    weeks = []
    wd_hs = []
    wd_ls = []
    weathers = []
    winer = []
    ## 提取数据
    for url in urls:
        # timeout added so one stalled request cannot hang the whole crawl.
        df = requests.get(url, headers=headers, timeout=30).json()['data']
        # Hoisted: the original called re1.findall(df) four times per page.
        cells = re1.findall(df)
        # Every 3rd generic cell starting at 0 is "YYYY-MM-DD 星期X".
        datess.extend(c.split(' ')[0] for c in cells[::3])
        weeks.extend(c.split(' ')[1] for c in cells[::3])
        # 最高温度
        wd_hs.extend(re2.findall(df))
        # 最低温度 -- a bare "°" means the value is missing on the site;
        # normalise to "0°" so the float() conversion below succeeds.
        wd_ls.extend("0°" if c == "°" else c for c in re3.findall(df))
        # 天气情况 / 风力 are the 2nd and 3rd cells of each group of 3.
        weathers.extend(cells[1::3])
        winer.extend(cells[2::3])

    datas = pd.DataFrame()
    datas['日期'] = pd.to_datetime(datess)
    datas['星期'] = weeks
    datas['最高气温'] = wd_hs
    datas['最低气温'] = wd_ls
    datas['天气情况'] = weathers
    datas['风力'] = winer
    # 提取日期: keep only the date part of the timestamp string.
    datas['日期'] = datas['日期'].map(lambda x: str(x).split(' ')[0])
    # 去掉摄氏度: strip the degree sign and convert to float.
    datas['最高气温'] = datas['最高气温'].map(lambda x: float(x.replace('°', '')))
    datas['最低气温'] = datas['最低气温'].map(lambda x: float(x.replace('°', '')))
    # 计算平均气温
    datas['平均气温'] = (datas['最高气温'] + datas['最低气温']) / 2
    # Fix: the original assumed the output directory already existed and
    # crashed with FileNotFoundError on a fresh checkout.
    os.makedirs('全国城市进行任意分析', exist_ok=True)
    datas.to_excel('全国城市进行任意分析/{}2011_2015_data_count.xlsx'.format(str(name)), index=None)
    print(name + "2011年到2015年的数据保存成功!")
def City_data_2016_2021(name,code):
print("正在爬取"+name+"2016年到2021年的天气数据:")
## 正则表达式来匹配数据
#### .是匹配除了/n之外的任何单个字符 此时为非贪心 只会匹配<td>****</td>里面的元素
r1 = '<td>(.*?)</td>'
re1 = re.compile(r1)
#### 看firefox可知为最高温度
r2 = '<td style="color:#ff5040;">(.*?)</td>'
re2 = re.compile(r2)
#### 看firefox可知为最低温度
r3 = '<td style="color:#3097fd;" >(.*?)</td>'
re3 = re.compile(r3)
### 网站的url
####2021年
urls_2021 = ['http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D={0}' \
'&areaInfo%5BareaType%5D=2&date%5Byear%5D=2021&date%5Bmonth%5D={1}'.format(code,str(i))for i in range(1,12)]
## 存储后五年的天气预报
urls_20__s = []
for j in range(2016,2021):
url_20__ = ['http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D={0}' \
'&areaInfo%5BareaType%5D=2&date%5Byear%5D={1}&date%5Bmonth%5D={2}'.format(code,j,str(i))for i in range(1,13)]
for x in range(len(url_20__)):
urls_20__s.append(url_20__[x])
urls =urls_20__s+urls_2021
### 网站的头
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0'}
datess=[]
weeks=[]
wd_hs=[]
wd_ls=[]
weathers=[]
winer=[]
###空气质量的变量
kongqizhiliangs=[]
kongqi_number=[]
kongqi_chiness=[]
for url in urls:
df = requests.get(url,headers = headers).json()['data']
soup = BeautifulSoup(df,'lxml')
#正则表达式提取日期
date = [l.split(' ')[0] for l in re1.findall(df)[::4]]
for i in range(len(date)):
datess.append(date[i])
#星期
week = [l.split(' ')[1] for l in re1.findall(df)[::4]]
for i in range(len(week)):
weeks.append(week[i])
#最高温度
wd_h = re2.findall(df)
for i in range(len(wd_h)):
wd_hs.append(wd_h[i])
#最低温度
wd_w = re3.findall(df)
for i in range(len(wd_w)):
wd_ls.append(wd_w[i])
#天气情况
wea = re1.findall(df)[1::4]
for i in range(len(wea)):
weathers.append(wea[i])
#风力
win = re1.findall(df)[2::4]
for i in range(len(win)):
winer.append(win[i])
#beautifulSoup提取空气质量
kongqi = soup('span',attrs={})
#### 因为香港的数据2019-9有一段数据为空
for i in kongqi:
if (i.text)=='-':
kongqi_number.append(" ")
kongqi_chiness.append(" ")
else:
kongqi_number.append(i.text.split(" ")[0])
kongqi_chiness.append(i.text.split(" ")[1])
data = pd.DataFrame()
data['日期'] = pd.to_datetime(datess)
data['星期'] = weeks
data['最高气温'] = wd_hs
data['最低气温'] = wd_ls
data['天气情况'] = weathers
data['风力'] = winer
data['空气质量数值']=kongqi_number
data['空气质量汉字']=kongqi_chiness
#提取日期
data['日期'] = data['日期'].map(lambda x:str(x).split(' ')[0])
#去掉摄氏度
data['最高气温'] = data['最高气温'].map(lambda x:float(x.replace('°','')))
data['最低气温'] = data['最低气温'].map(lambda
天气数据爬取的源代码-可以运行
需积分: 0 44 浏览量
2023-08-06
23:23:26
上传
评论 4
收藏 7KB ZIP 举报
辣子不辣,英语不难
- 粉丝: 437
- 资源: 7
最新资源
- 基于python的高性能爬虫程序,使用了多线程+缓存+xpath实现的,这里以彼-岸图库为例,实现,仅用于学习交流
- 中分辨率成像光谱仪(MODIS)烧毁面积产品信息MODIS-C6-BA-User-Guide-1.2.pdf
- Screenshot_20240427_172613_com.huawei.browser.jpg
- 关于学习Python的相关资源网站链接及相关介绍.docx
- (HAL库)基于STM32F103C8T6的温控PID系统[Dht11、ESP8266、无线透传、L298N……]
- VoLTE高丢包优化指导书.xlsx
- Rust资源文件.zip
- 前后端分离实践:使用 React 和 Express 搭建完整登录注册流程
- gradle-publish-to-MavenLocal.zip
- 10份网络优化创新案例.zip
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈