import requests
from lxml import etree
from selenium import webdriver
import pandas as pd
import os
import time
# Shared HTTP request headers for all gushiwen.org requests: they mimic a
# desktop Chrome browser so the site serves normal HTML to the scraper.
# NOTE(review): the cookie below is a hard-coded session captured in 2018 and
# is almost certainly expired — confirm whether the site still responds
# correctly without it, or refresh it before a new crawl.
headers={
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'cookie': 'ASP.NET_SessionId=lepd22lrt5uqioo5s25j5p23; Hm_lvt_04660099568f561a75456483228a9516=1534664307; Hm_lpvt_04660099568f561a75456483228a9516=1534669012',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
def poetry():
    """Scrape the "shi" (诗) listing pages from gushiwen.org.

    Walks pages 1-999 of the category-1 listing, extracts the title,
    dynasty, author name and like-count of every entry, and checkpoints
    the accumulated data to '诗.xlsx' after each page.

    :return: None (results are written to '诗.xlsx' as a side effect)
    """
    baseurl = 'https://www.gushiwen.org/shiwen/default_4A1A{0}.aspx'
    # Accumulate plain dicts and build the frame per checkpoint:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    rows = []
    for page in range(1, 1000):
        try:
            url = baseurl.format(page)
            response = requests.get(url, headers=headers).text
            html = etree.HTML(response)
            items = html.xpath('//div[2]/div[1]/div[@class="sons"]')
            for item in items:
                # Blocks missing any expected field (ads, malformed
                # entries) raise IndexError on [0] and are skipped.
                try:
                    rows.append({
                        'title': item.xpath('./div[1]/p[1]/a/b')[0].text,
                        'dynasty': item.xpath('./div[1]/p[2]/a[1]')[0].text,
                        'name': item.xpath('./div[1]/p[2]/a[2]')[0].text,
                        'good': item.xpath('./div[@class="tool"]/div[@class="good"]/a/span')[0].text,
                    })
                except IndexError:
                    continue
            # Checkpoint after every page so a crash loses at most one page.
            pd.DataFrame(rows, columns=['title', 'dynasty', 'name', 'good']).to_excel('诗.xlsx')
            print(page)
            time.sleep(2)  # politeness delay between page fetches
        except Exception:
            # Best-effort crawl: skip pages that fail to download or parse
            # (narrowed from a bare except that also ate KeyboardInterrupt).
            continue
def words():
    """Scrape the "ci" (词) listing pages from gushiwen.org.

    Walks pages 1-999 of the category-2 listing, extracts the title,
    dynasty, author name and like-count of every entry, and checkpoints
    the accumulated data to '词.xlsx' after each page.

    :return: None (results are written to '词.xlsx' as a side effect)
    """
    baseurl = 'https://www.gushiwen.org/shiwen/default_4A2A{0}.aspx'
    # Accumulate plain dicts and build the frame per checkpoint:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    rows = []
    for page in range(1, 1000):
        try:
            url = baseurl.format(page)
            response = requests.get(url, headers=headers).text
            html = etree.HTML(response)
            items = html.xpath('//div[2]/div[1]/div[@class="sons"]')
            for item in items:
                # Blocks missing any expected field (ads, malformed
                # entries) raise IndexError on [0] and are skipped.
                try:
                    rows.append({
                        'title': item.xpath('./div[1]/p[1]/a/b')[0].text,
                        'dynasty': item.xpath('./div[1]/p[2]/a[1]')[0].text,
                        'name': item.xpath('./div[1]/p[2]/a[2]')[0].text,
                        'good': item.xpath('./div[@class="tool"]/div[@class="good"]/a/span')[0].text,
                    })
                except IndexError:
                    continue
            # Checkpoint after every page so a crash loses at most one page.
            pd.DataFrame(rows, columns=['title', 'dynasty', 'name', 'good']).to_excel('词.xlsx')
            print(page)
            # Politeness delay, added for consistency with the other
            # category scrapers (the original hammered the site here).
            time.sleep(1)
        except Exception:
            # Best-effort crawl: skip pages that fail to download or parse
            # (narrowed from a bare except that also ate KeyboardInterrupt).
            continue
def bends():
    """Scrape the "qu" (曲) listing pages from gushiwen.org.

    Walks pages 1-145 of the category-3 listing, extracts the title,
    dynasty, author name and like-count of every entry, and checkpoints
    the accumulated data to '曲.xlsx' after each page.

    :return: None (results are written to '曲.xlsx' as a side effect)
    """
    baseurl = 'https://www.gushiwen.org/shiwen/default_4A3A{0}.aspx'
    # Accumulate plain dicts and build the frame per checkpoint:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    rows = []
    for page in range(1, 146):
        try:
            url = baseurl.format(page)
            response = requests.get(url, headers=headers).text
            html = etree.HTML(response)
            items = html.xpath('//div[2]/div[1]/div[@class="sons"]')
            for item in items:
                # Blocks missing any expected field (ads, malformed
                # entries) raise IndexError on [0] and are skipped.
                try:
                    rows.append({
                        'title': item.xpath('./div[1]/p[1]/a/b')[0].text,
                        'dynasty': item.xpath('./div[1]/p[2]/a[1]')[0].text,
                        'name': item.xpath('./div[1]/p[2]/a[2]')[0].text,
                        'good': item.xpath('./div[@class="tool"]/div[@class="good"]/a/span')[0].text,
                    })
                except IndexError:
                    continue
            # Checkpoint after every page so a crash loses at most one page.
            pd.DataFrame(rows, columns=['title', 'dynasty', 'name', 'good']).to_excel('曲.xlsx')
            print(page)
            time.sleep(1)  # politeness delay between page fetches
        except Exception:
            # Best-effort crawl: skip pages that fail to download or parse
            # (narrowed from a bare except that also ate KeyboardInterrupt).
            continue
def ancient():
    """Scrape the "wen" (文, classical prose) listing pages from gushiwen.org.

    Walks pages 1-60 of the category-4 listing, extracts the title,
    dynasty, author name and like-count of every entry, and checkpoints
    the accumulated data to '文.xlsx' after each page.

    :return: None (results are written to '文.xlsx' as a side effect)
    """
    baseurl = 'https://www.gushiwen.org/shiwen/default_4A4A{0}.aspx'
    # Accumulate plain dicts and build the frame per checkpoint:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    rows = []
    for page in range(1, 61):
        try:
            url = baseurl.format(page)
            response = requests.get(url, headers=headers).text
            html = etree.HTML(response)
            items = html.xpath('//div[2]/div[1]/div[@class="sons"]')
            for item in items:
                # Blocks missing any expected field (ads, malformed
                # entries) raise IndexError on [0] and are skipped.
                try:
                    rows.append({
                        'title': item.xpath('./div[1]/p[1]/a/b')[0].text,
                        'dynasty': item.xpath('./div[1]/p[2]/a[1]')[0].text,
                        'name': item.xpath('./div[1]/p[2]/a[2]')[0].text,
                        'good': item.xpath('./div[@class="tool"]/div[@class="good"]/a/span')[0].text,
                    })
                except IndexError:
                    continue
            # Checkpoint after every page so a crash loses at most one page.
            pd.DataFrame(rows, columns=['title', 'dynasty', 'name', 'good']).to_excel('文.xlsx')
            print(page)
            # Politeness delay, added for consistency with the other
            # category scrapers (the original hammered the site here).
            time.sleep(1)
        except Exception:
            # Best-effort crawl: skip pages that fail to download or parse
            # (narrowed from a bare except that also ate KeyboardInterrupt).
            continue
def concat(shi_path='C:/daima/dataanalysis/poem/诗.xlsx',
           ci_path='C:/daima/dataanalysis/poem/词.xlsx'):
    """Merge the scraped poem (诗) and ci (词) workbooks into one DataFrame.

    Bug fix: the original computed pd.concat but discarded the result,
    making the function a no-op; the merged frame is now returned.
    The source paths are parameterized with the original hard-coded
    defaults, so existing callers are unaffected.

    NOTE(review): despite the original "诗词曲文合并" comment, only the
    shi and ci workbooks are merged here — 曲/文 were never included.

    :param shi_path: path to the poems workbook
    :param ci_path: path to the ci workbook
    :return: pandas DataFrame containing both workbooks' rows
    """
    ds = pd.read_excel(shi_path)
    dc = pd.read_excel(ci_path)
    return pd.concat([ds, dc])
def infobaidu():
    """Fetch Baidu Baike popularity metrics for each poet.

    Reads poet names from 100.xlsx, loads every poet's Baidu Baike page
    in a Chrome webdriver, scrapes the vote counter ('hot') and share
    counter ('enjoy'), and checkpoints results to '102.xlsx' after each
    poet.

    :return: None (results are written to '102.xlsx' as a side effect)
    """
    browser = webdriver.Chrome()
    try:
        # Accumulate plain dicts and rebuild the frame per checkpoint:
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
        rows = []
        data = pd.read_excel('C:/daima/dataanalysis/poem/100.xlsx')
        base_url = 'https://baike.baidu.com/item/{0}'
        for name in data['name']:
            try:
                browser.get(base_url.format(name))
                time.sleep(1)  # give the page's counters time to render
                html = etree.HTML(browser.page_source)
                hot = html.xpath('//*[@id="j-top-vote"]/span[@class="vote-count"]')[0].text
                enjoy = html.xpath('//*[@id="j-topShareCount"]')[0].text
                rows.append({'name': name, 'hot': hot, 'enjoy': enjoy})
                # Checkpoint after every poet so a crash loses at most one row.
                pd.DataFrame(rows, columns=['name', 'hot', 'enjoy']).to_excel('102.xlsx')
                time.sleep(1)  # politeness delay between page loads
            except Exception:
                # Best-effort: skip poets whose page is missing or laid out
                # differently (narrowed from a bare except).
                continue
    finally:
        # Fix: the original never closed the driver, leaking a Chrome
        # process on every run.
        browser.quit()
# Script entry point: only the Baidu popularity scrape is currently enabled.
# The listing scrapers / merge step were presumably run in earlier sessions
# and left commented out as a record of the pipeline order.
if __name__ == '__main__':
    #ancient()
    #concat()
    infobaidu()
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
python爬虫案例源码,python爬虫学习,python爬虫案例,python爬取在线网站数据,python爬取网站数据,python数据分析,数据获取,项目实战,python爬虫小例子,python爬虫代码示例,python爬虫简单示例,python爬虫教程学习。
资源推荐
资源详情
资源评论
收起资源包目录
Python源码获取诗人的百度热度数据.zip (13个子文件)
Python源码获取诗人的百度热度数据
__init__.py 0B
2.ipynb 39KB
1.ipynb 17KB
4.ipynb 26KB
render.html 708KB
5.ipynb 9KB
poem.py 6KB
.ipynb_checkpoints
5-checkpoint.ipynb 72B
4-checkpoint.ipynb 72B
1-checkpoint.ipynb 72B
2-checkpoint.ipynb 38KB
3-checkpoint.ipynb 72B
3.ipynb 13KB
共 13 条
- 1
资源评论
随风浪仔
- 粉丝: 598
- 资源: 1838
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- STC15单片机串口2使用程序例子
- 读取日志的excel生成周报 用python3开发weekplan-master.zip
- python 读取excel数据导入dbimport-data-master.zip
- K折交叉验证BP神经网络,多输入多输出BP神经网络(代码完整,数据齐全)
- B07训练原图.zip
- python-对Excel数据处理做可视化分析.zip
- 人工智能大作业-无人机图像目标检测的python源代码+文档说明.zip
- 基于GoogLeNet实现Cifar-10图像分类项目python源码(高分项目).zip
- 数据库 sql 面试题目及答案解析.docx
- 汽车常见 10 种传感器故障后的表现与解决措施.docx
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功