# 目标:爬取三亚所有游记详细信息,统计并保存至文件, https://you.ctrip.com/place/sanya61.html
import numpy as np
import pandas as pd
import time, requests
import asyncio
import random
from bs4 import BeautifulSoup #导入BeautifulSoup 模块
import time
import matplotlib.pyplot as plt
import math
import json
# POST endpoint returning the paginated list of travel notes for a district.
postUrl = 'https://m.ctrip.com/restapi/soa2/22670/getRecommendTravel'
# Spoofed browser headers; Content-Type declares the JSON request body used by the POST calls.
requestHeaders = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Content-Type': 'application/json'
}
# 1.1 Fetch the detail page of one travel note; ``page``/``index`` identify it for logging.
async def getTravelDetails2Dict(travelInfo, page, index):
    """Download one travel note's detail page and merge its stats with the list data.

    Args:
        travelInfo: one element of the list endpoint's ``travelInfoList``;
            must carry ``id``, ``title``, ``viewNumber``, ``commentNumber``,
            ``pictureNumber`` and ``author['name']``.
        page: 1-based page number (logging only).
        index: 1-based position within the page (logging only).

    Returns:
        dict combining the list fields with the parsed detail-page stats.

    Raises:
        requests.RequestException on network failure, AttributeError when the
        expected stats ``div`` is missing — both are caught per page by the
        caller (``getCurrPageTravelInfos``).
    """
    requestUrl = 'https://you.ctrip.com/travels/Sanya61/' + str(travelInfo['id']) + '.html'
    # BUG FIX: ``requests`` is blocking; calling it directly inside a coroutine
    # stalls the event loop and serialises the whole asyncio.gather() fan-out.
    # Run it on the default thread-pool executor so detail pages download
    # concurrently, and add a timeout so a dead connection cannot hang forever.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None, lambda: requests.get(requestUrl, headers=requestHeaders, timeout=30))
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
    div = soup.find("div", class_="ctd_content_controls cf")

    def _stat(cls, default):
        # The stats bar renders "<i class=cls/>label:value"; return the text
        # after the colon, or ``default`` when that stat is absent.
        tag = div.find('i', class_=cls)
        return default if tag is None else tag.next_sibling.split(':')[1]

    days = _stat('days', '0')
    times = _stat('times', '0')
    costs = _stat('costs', '0')
    whos = _stat('whos', '')
    plays = _stat('plays', '')
    positions = '' if div.find('a', class_='gs_a_poi') is None else \
        ",".join([a.string for a in div.find_all('a', class_='gs_a_poi')])
    await asyncio.sleep(random.uniform(1.0, 8.0) / 10)  # polite random delay, 0.1-0.8 s
    print(" 第{}页第{}条游记获取成功.........{}".format(str(page), str(index), requestUrl))
    # Data scraped from the detail page, merged with the list-endpoint fields.
    return {
        'id': travelInfo['id'],
        '标题': travelInfo['title'],
        '浏览量': travelInfo['viewNumber'],
        '评论数量': travelInfo['commentNumber'],
        '图片数量': travelInfo['pictureNumber'],
        '作者': travelInfo['author']['name'],
        "游玩时长": int(days.split(" ")[0]),  # strip the '天' (days) suffix
        "游玩月份": int(times.split(" ")[0]),  # strip the '月' (month) suffix
        "人均消费": float(costs.split(" ")[0]),  # strip the '元' (CNY) suffix
        "同行人员": whos,
        "游玩项目": plays,
        "游玩景点": positions,
    }
# Step 1: fetch and return all travel notes (including their details) for one list page.
async def getCurrPageTravelInfos(page):
    """Collect every travel note of one list page, details included.

    Flow:
        1. POST to getRecommendTravel for the ids on page ``page``
           (https://m.ctrip.com/restapi/soa2/22670/getRecommendTravel).
        2. GET each detail page https://you.ctrip.com/travels/Sanya61/${id}.html
           concurrently via ``getTravelDetails2Dict``.

    Args:
        page: 1-based index of the list page to fetch.

    Returns:
        list of per-note dicts for this page, or ``[]`` when anything fails
        (the error is logged, never raised to the caller).
    """
    requestPayload = {
        "districtId": 61,
        "head": {
            "cid": "09031175415293329099",
            "ctok": "",
            "cver": "1.0",
            "lang": "01",
            "sid": "8888",
            "syscode": "999",
            "auth": "",
            "extension": []
        },
        "pageIndex": page,
        "sourceFrom": 0,
        "type": 3
    }
    try:
        # timeout added so a stalled connection cannot hang the whole crawl
        pageResult = requests.post(postUrl, json=requestPayload,
                                   headers=requestHeaders, timeout=30)
        pageResult.raise_for_status()
        travelInfoList = pageResult.json()['travelInfoList']
        allTask = [asyncio.ensure_future(getTravelDetails2Dict(travelInfo, page, index))
                   for index, travelInfo in enumerate(travelInfoList, start=1)]
        currPageTravelInfos = await asyncio.gather(*allTask)
    except Exception as ex:
        # best-effort crawl: log and skip the page instead of aborting the run
        print("第{}页采集出错,出错原因: {}。".format(str(page), ex))
        return []
    else:
        # BUG FIX: the original returned from inside ``try``, which made this
        # else clause (throttle sleep + completion log) unreachable dead code.
        await asyncio.sleep(random.uniform(1.0, 8.0) / 10)
        print("第{}页采集完毕-----------".format(str(page)))
        return currPageTravelInfos
# 2 第二步 统计并保存所有页数据到xlsx文件 ,返回统计表.xlsx统计结果 [(工作表名字,填入工作表的DataFrame),....]
def saveToExcel(allPageTraveInfos):
sheetCount = math.ceil(len(allPageTraveInfos)/5) # 多少个5页
with pd.ExcelWriter('结果表.xlsx') as writer:
for sheetIndex in range(sheetCount):
fivePageData = allPageTraveInfos[sheetIndex:sheetIndex+5] # sheetIndex - sheetIndex+5页
fivePageData = pd.DataFrame(np.hstack([pageData for pageData in fivePageData]).tolist())
sheetName = '第'+str(5*sheetIndex)+'-'+str((sheetIndex+1)*5)+'页'
fivePageData.to_excel(writer, sheet_name=sheetName, index=False)
worksheet = writer.sheets[sheetName] # pull worksheet object
for idx, col in enumerate(fivePageData): # loop through all columns
series = fivePageData[col]
max_len = max((
series.astype(str).map(len,na_action='ignore').max(), # len of largest item
len(str(series.name)) # len of column name/header
)) + 10 # adding a little extra space
worksheet.set_column(idx, idx, max_len) # set column width
print(' '+sheetName+"页数据保存到文件成功........")
table = pd.DataFrame(pd.DataFrame(np.hstack([pageData for pageData in allPageTraveInfos]).tolist()))
priceArea = [1500*i for i in range(20)]
costDf = pd.value_counts(values=pd.cut(table['人均消费'].values, priceArea),ascending=False).to_frame(name="人次")
costDf.index.name='价格区间' # 定义索引列名
costDf.reset_index(inplace=True) # 重置索引成为列
costDf['价格区间']=costDf['价格区间'].astype('str') # Categories类型转为字符串
monthesDf = pd.value_counts(table['游玩月份'],ascending=False).to_frame(name="人次")
monthesDf.index.name='月份'
monthesDf.reset_index(inplace=True)
positionDf = pd.value_counts(table['游玩景点'].sum().split(',')).to_frame(name='人次')
positionDf.index.name='游玩景点'
positionDf.reset_index(inplace=True)
playDf = pd.value_counts(table['游玩项目'].sum().split(',')).to_frame(name='人次')
playDf.index.name='项目玩法'
playDf.reset_index(inplace=True)
whoDf = pd.value_counts(table['同行人员'],ascending=False).to_frame(name='人次')
whoDf.index.name='同行人员'
whoDf.reset_index(inplace=True)
# 作者游玩费用和游玩时长的统计
authorGroupDf = table.groupby(['作者']).agg({'人均消费': [('消费总额','sum'),('每次平均人均消费','mean'),
('最小消费','min'),('最大消费','max')],
'游玩时长': [('总时长(天)','sum'),('最短时长(天)','min'),
('最长时长(天)','max'),('游玩次数','count')]
})
authorGroupDf.index.name='作者'
authorGroupDf.reset_index(inplace=True)
dfs = [('消费价格区间',costDf),('月份发表游记人次',monthesDf),
('作者消费和时长',authorGroupDf),('游玩景点',positionDf),
('游玩玩法',playDf),('同行人员',whoDf)]
writer = pd.ExcelWriter('统计表.xlsx', engine='xlsxwriter')
for sheetname, df in dfs: # loop through `dict` of dataframes
index = (True if sheetname=='作者消费和时长' else False)
df.to_excel(writer, sheet_name=sheetname,index=index) # send df to writer
worksheet = writer.sheets[sheetname] #