# 目标:爬取三亚所有游记详细信息,统计并保存至文件, https://you.ctrip.com/place/sanya61.html
import numpy as np
import pandas as pd
import time, requests
import asyncio
import random
from bs4 import BeautifulSoup #导入BeautifulSoup 模块
import time
import matplotlib.pyplot as plt
import math
import json
# POST endpoint returning the paginated list of travel notes for a district.
postUrl = 'https://m.ctrip.com/restapi/soa2/22670/getRecommendTravel'
# Spoofed browser headers; Content-Type declares the JSON request body used by the POST calls.
requestHeaders = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Content-Type': 'application/json'
}
# 1.1 Fetch the detail page of one travel note; ``page``/``index`` identify it for logging.
async def getTravelDetails2Dict(travelInfo, page, index):
    """Download one travel note's detail page and merge its stats with the list data.

    Args:
        travelInfo: one element of the list endpoint's ``travelInfoList``;
            must carry ``id``, ``title``, ``viewNumber``, ``commentNumber``,
            ``pictureNumber`` and ``author['name']``.
        page: 1-based page number (logging only).
        index: 1-based position within the page (logging only).

    Returns:
        dict combining the list fields with the parsed detail-page stats.

    Raises:
        requests.RequestException on network failure, AttributeError when the
        expected stats ``div`` is missing — both are caught per page by the
        caller (``getCurrPageTravelInfos``).
    """
    requestUrl = 'https://you.ctrip.com/travels/Sanya61/' + str(travelInfo['id']) + '.html'
    # BUG FIX: ``requests`` is blocking; calling it directly inside a coroutine
    # stalls the event loop and serialises the whole asyncio.gather() fan-out.
    # Run it on the default thread-pool executor so detail pages download
    # concurrently, and add a timeout so a dead connection cannot hang forever.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None, lambda: requests.get(requestUrl, headers=requestHeaders, timeout=30))
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
    div = soup.find("div", class_="ctd_content_controls cf")

    def _stat(cls, default):
        # The stats bar renders "<i class=cls/>label:value"; return the text
        # after the colon, or ``default`` when that stat is absent.
        tag = div.find('i', class_=cls)
        return default if tag is None else tag.next_sibling.split(':')[1]

    days = _stat('days', '0')
    times = _stat('times', '0')
    costs = _stat('costs', '0')
    whos = _stat('whos', '')
    plays = _stat('plays', '')
    positions = '' if div.find('a', class_='gs_a_poi') is None else \
        ",".join([a.string for a in div.find_all('a', class_='gs_a_poi')])
    await asyncio.sleep(random.uniform(1.0, 8.0) / 10)  # polite random delay, 0.1-0.8 s
    print(" 第{}页第{}条游记获取成功.........{}".format(str(page), str(index), requestUrl))
    # Data scraped from the detail page, merged with the list-endpoint fields.
    return {
        'id': travelInfo['id'],
        '标题': travelInfo['title'],
        '浏览量': travelInfo['viewNumber'],
        '评论数量': travelInfo['commentNumber'],
        '图片数量': travelInfo['pictureNumber'],
        '作者': travelInfo['author']['name'],
        "游玩时长": int(days.split(" ")[0]),  # strip the '天' (days) suffix
        "游玩月份": int(times.split(" ")[0]),  # strip the '月' (month) suffix
        "人均消费": float(costs.split(" ")[0]),  # strip the '元' (CNY) suffix
        "同行人员": whos,
        "游玩项目": plays,
        "游玩景点": positions,
    }
# Step 1: fetch and return all travel notes (including their details) for one list page.
async def getCurrPageTravelInfos(page):
    """Collect every travel note of one list page, details included.

    Flow:
        1. POST to getRecommendTravel for the ids on page ``page``
           (https://m.ctrip.com/restapi/soa2/22670/getRecommendTravel).
        2. GET each detail page https://you.ctrip.com/travels/Sanya61/${id}.html
           concurrently via ``getTravelDetails2Dict``.

    Args:
        page: 1-based index of the list page to fetch.

    Returns:
        list of per-note dicts for this page, or ``[]`` when anything fails
        (the error is logged, never raised to the caller).
    """
    requestPayload = {
        "districtId": 61,
        "head": {
            "cid": "09031175415293329099",
            "ctok": "",
            "cver": "1.0",
            "lang": "01",
            "sid": "8888",
            "syscode": "999",
            "auth": "",
            "extension": []
        },
        "pageIndex": page,
        "sourceFrom": 0,
        "type": 3
    }
    try:
        # timeout added so a stalled connection cannot hang the whole crawl
        pageResult = requests.post(postUrl, json=requestPayload,
                                   headers=requestHeaders, timeout=30)
        pageResult.raise_for_status()
        travelInfoList = pageResult.json()['travelInfoList']
        allTask = [asyncio.ensure_future(getTravelDetails2Dict(travelInfo, page, index))
                   for index, travelInfo in enumerate(travelInfoList, start=1)]
        currPageTravelInfos = await asyncio.gather(*allTask)
    except Exception as ex:
        # best-effort crawl: log and skip the page instead of aborting the run
        print("第{}页采集出错,出错原因: {}。".format(str(page), ex))
        return []
    else:
        # BUG FIX: the original returned from inside ``try``, which made this
        # else clause (throttle sleep + completion log) unreachable dead code.
        await asyncio.sleep(random.uniform(1.0, 8.0) / 10)
        print("第{}页采集完毕-----------".format(str(page)))
        return currPageTravelInfos
# 2 第二步 统计并保存所有页数据到xlsx文件 ,返回统计表.xlsx统计结果 [(工作表名字,填入工作表的DataFrame),....]
def saveToExcel(allPageTraveInfos):
sheetCount = math.ceil(len(allPageTraveInfos)/5) # 多少个5页
with pd.ExcelWriter('结果表.xlsx') as writer:
for sheetIndex in range(sheetCount):
fivePageData = allPageTraveInfos[sheetIndex:sheetIndex+5] # sheetIndex - sheetIndex+5页
fivePageData = pd.DataFrame(np.hstack([pageData for pageData in fivePageData]).tolist())
sheetName = '第'+str(5*sheetIndex)+'-'+str((sheetIndex+1)*5)+'页'
fivePageData.to_excel(writer, sheet_name=sheetName, index=False)
worksheet = writer.sheets[sheetName] # pull worksheet object
for idx, col in enumerate(fivePageData): # loop through all columns
series = fivePageData[col]
max_len = max((
series.astype(str).map(len,na_action='ignore').max(), # len of largest item
len(str(series.name)) # len of column name/header
)) + 10 # adding a little extra space
worksheet.set_column(idx, idx, max_len) # set column width
print(' '+sheetName+"页数据保存到文件成功........")
table = pd.DataFrame(pd.DataFrame(np.hstack([pageData for pageData in allPageTraveInfos]).tolist()))
priceArea = [1500*i for i in range(20)]
costDf = pd.value_counts(values=pd.cut(table['人均消费'].values, priceArea),ascending=False).to_frame(name="人次")
costDf.index.name='价格区间' # 定义索引列名
costDf.reset_index(inplace=True) # 重置索引成为列
costDf['价格区间']=costDf['价格区间'].astype('str') # Categories类型转为字符串
monthesDf = pd.value_counts(table['游玩月份'],ascending=False).to_frame(name="人次")
monthesDf.index.name='月份'
monthesDf.reset_index(inplace=True)
positionDf = pd.value_counts(table['游玩景点'].sum().split(',')).to_frame(name='人次')
positionDf.index.name='游玩景点'
positionDf.reset_index(inplace=True)
playDf = pd.value_counts(table['游玩项目'].sum().split(',')).to_frame(name='人次')
playDf.index.name='项目玩法'
playDf.reset_index(inplace=True)
whoDf = pd.value_counts(table['同行人员'],ascending=False).to_frame(name='人次')
whoDf.index.name='同行人员'
whoDf.reset_index(inplace=True)
# 作者游玩费用和游玩时长的统计
authorGroupDf = table.groupby(['作者']).agg({'人均消费': [('消费总额','sum'),('每次平均人均消费','mean'),
('最小消费','min'),('最大消费','max')],
'游玩时长': [('总时长(天)','sum'),('最短时长(天)','min'),
('最长时长(天)','max'),('游玩次数','count')]
})
authorGroupDf.index.name='作者'
authorGroupDf.reset_index(inplace=True)
dfs = [('消费价格区间',costDf),('月份发表游记人次',monthesDf),
('作者消费和时长',authorGroupDf),('游玩景点',positionDf),
('游玩玩法',playDf),('同行人员',whoDf)]
writer = pd.ExcelWriter('统计表.xlsx', engine='xlsxwriter')
for sheetname, df in dfs: # loop through `dict` of dataframes
index = (True if sheetname=='作者消费和时长' else False)
df.to_excel(writer, sheet_name=sheetname,index=index) # send df to writer
worksheet = writer.sheets[sheetname] #