# -*- coding:utf-8 -*-
import scrapy
from scrapy.http import Request
from ZhiHu.items import ZhihuItem
from ZhiHu.MysqlPipelines.Mysql import NumberCheck
from scrapy.utils.project import get_project_settings
from ZhiHu.settings import Tool
import requests
import json
class Myspider(scrapy.Spider):
    '''Initialise the crawl counters and fetch the current follower total.'''
    name = 'ZhiHu'
    allowed_domains = ['zhihu.com']
    L = ''  # scratch: last formatted education entry
    K = ''  # scratch: last formatted employment entry
    All_Num = 546049                     # crawl target, filled in by hand
    Save_Num = NumberCheck.find_save()   # number of records already saved
    DB_Num = NumberCheck.find_db_real()  # Real_Num of the last DB row from the previous run
    Last_Num = NumberCheck.find_last()   # excited-vczh's follower total recorded by the previous run
url='https://www.zhihu.com/api/v4/members/excited-vczh/followers?include=data%5B*%5D.locations%2Cemployments%2Cgender%2Ceducations%2Cbusiness%2Cvoteup_count%2Cthanked_Count%2Cfollower_count%2Cfollowing_count%2Ccover_url%2Cfollowing_topic_count%2Cfollowing_question_count%2Cfollowing_favlists_count%2Cfollowing_columns_count%2Cavatar_hue%2Canswer_count%2Carticles_count%2Cpins_count%2Cquestion_count%2Ccommercial_question_count%2Cfavorite_count%2Cfavorited_count%2Clogs_count%2Cmarked_answers_count%2Cmarked_answers_text%2Cmessage_thread_token%2Caccount_status%2Cis_active%2Cis_force_renamed%2Cis_bind_sina%2Csina_weibo_url%2Csina_weibo_name%2Cshow_sina_weibo%2Cis_blocking%2Cis_blocked%2Cis_following%2Cis_followed%2Cmutual_followees_count%2Cvote_to_count%2Cvote_from_count%2Cthank_to_count%2Cthank_from_count%2Cthanked_count%2Cdescription%2Chosted_live_count%2Cparticipated_live_count%2Callow_message%2Cindustry_category%2Corg_name%2Corg_homepage%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&offset=0'
    settings = get_project_settings()
    response = requests.get(url, headers=settings['DEFAULT_REQUEST_HEADERS'])
    info = json.loads(response.text)  # named 'info' so it does not shadow the parse() method below
    try:
        # Latest follower total; a KeyError here means the authorization token has expired
        Now_Num = info['paging']['totals']
        # The follower list keeps growing, so compute the real starting offset Real_Num.
        # On the first run DB_Num and Last_Num are None; on every later run both are set.
        if DB_Num is not None and Last_Num is not None:
            Real_Num = DB_Num + (Now_Num - Last_Num) - 1
        else:
            Real_Num = All_Num
            Save_Num = 0
        print('Crawl target:    ', All_Num)
        print('Already saved:   ', Save_Num)
        print('')
        print('Followers now:   ', Now_Num)
        print('Followers before:', Last_Num)
    except KeyError:
        print('\nAuthorization token expired: stop the spider, grab a fresh token and update it in settings')
        Real_Num = 0  # schedule no requests so the spider exits cleanly
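    # A sketch of the DEFAULT_REQUEST_HEADERS this spider assumes in settings.py.
    # The exact values are assumptions, not taken from the original project; the
    # 'authorization' token is what expires and triggers the KeyError above:
    #
    # DEFAULT_REQUEST_HEADERS = {
    #     'accept': 'application/json, text/plain, */*',
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    #     'Referer': 'https://www.zhihu.com/people/excited-vczh/followers',
    #     'authorization': 'oauth <token copied from a logged-in browser session>',
    # }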
    def start_requests(self):
        # Walk the offsets from newest (Real_Num - 1) down to 0; every page holds
        # 20 users, but parse() keeps only data[0], so each follower is yielded once
        urls = ['https://www.zhihu.com/api/v4/members/excited-vczh/followers?include=data%5B*%5D.locations%2Cemployments%2Cgender%2Ceducations%2Cbusiness%2Cvoteup_count%2Cthanked_Count%2Cfollower_count%2Cfollowing_count%2Ccover_url%2Cfollowing_topic_count%2Cfollowing_question_count%2Cfollowing_favlists_count%2Cfollowing_columns_count%2Cavatar_hue%2Canswer_count%2Carticles_count%2Cpins_count%2Cquestion_count%2Ccommercial_question_count%2Cfavorite_count%2Cfavorited_count%2Clogs_count%2Cmarked_answers_count%2Cmarked_answers_text%2Cmessage_thread_token%2Caccount_status%2Cis_active%2Cis_force_renamed%2Cis_bind_sina%2Csina_weibo_url%2Csina_weibo_name%2Cshow_sina_weibo%2Cis_blocking%2Cis_blocked%2Cis_following%2Cis_followed%2Cmutual_followees_count%2Cvote_to_count%2Cvote_from_count%2Cthank_to_count%2Cthank_from_count%2Cthanked_count%2Cdescription%2Chosted_live_count%2Cparticipated_live_count%2Callow_message%2Cindustry_category%2Corg_name%2Corg_homepage%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&offset=' + str(i) for i in reversed(range(self.Real_Num))]
        for url in urls:
            yield Request(url, callback=self.parse)
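    # Example of the offset scheme above (a sketch): with self.Real_Num == 3 the
    # list is built for offsets 2, 1, 0, so the most recently added follower is
    # requested first and parse() reads exactly one new user per response.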
    def parse(self, response):
        '''Parse the JSON payload and map one user onto a ZhihuItem'''
        data = json.loads(response.text)['data']
        i = data[0]  # only the first user of each 20-user page is taken (see start_requests)
        item = ZhihuItem()
        self.Save_Num += 1  # advance the running count of saved records
        item['Save_Num'] = self.Save_Num
        item['Last_Num'] = self.Now_Num
        item['Real_Num'] = self.Real_Num
        item['name'] = i['name']
        # Strip markup from the headline and bio with the project's Tool helper
        tool = Tool()
        item['headline'] = tool.replace(i['headline'])
        item['description'] = tool.replace(i['description'])
item['detailURL']='https://www.zhihu.com/people/'+str(i['url_token'])
item['gender']=i['gender']
item['user_type']=i['user_type']
item['is_active']=i['is_active']
        if len(i['locations']) == 0:
            item['locations'] = ''
        else:
            # join every listed location instead of keeping only the last one
            item['locations'] = ' '.join(n['name'] for n in i['locations'])
try:
item['business']=i['business']['name']
except KeyError:
item['business']=''
        # Education history: 'school' and 'major' may each be missing
        if len(i['educations']) == 0:
            item['educations'] = ''
        else:
            content = []
            for n in i['educations']:
                S = 'school' in n
                M = 'major' in n
                if S and M:
                    self.L = n['school']['name'] + '/' + n['major']['name']
                elif S:
                    self.L = n['school']['name']
                elif M:
                    self.L = n['major']['name']
                content.append(self.L)
            item['educations'] = ' '.join(content)
        # Employment history: 'company' and 'job' may each be missing
        if len(i['employments']) == 0:
            item['employments'] = ''
        else:
            content = []
            for n in i['employments']:
                C = 'company' in n
                J = 'job' in n
                if C and J:
                    self.K = n['company']['name'] + '/' + n['job']['name']
                elif C:
                    self.K = n['company']['name']
                elif J:
                    self.K = n['job']['name']
                content.append(self.K)
            item['employments'] = ' '.join(content)
item['following_count']=i['following_count']
item['follower_count']=i['follower_count']
item['mutual_followees_count']=i['mutual_followees_count']
item['voteup_count']=i['voteup_count']
item['thanked_count']=i['thanked_count']
item['favorited_count']=i['favorited_count']
item['logs_count']=i['logs_count']
item['following_question_count']=i['following_question_count']
item['following_topic_count']=i['following_topic_count']
item['following_favlists_count']=i['following_favlists_count']
item['following_columns_count']=i['following_columns_count']
item['articles_count']=i['articles_count']
item['question_count']=i['question_count']
item['answer_count']=i['answer_count']
item['pins_count']=i['pins_count']
item['participated_live_count']=i['participated_live_count']
item['hosted_live_count']=i['hosted_live_count']
        print('Record no.:', self.Real_Num)
        self.Real_Num -= 1
        return item
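
# ---------------------------------------------------------------------------
# The two project-local helpers imported at the top (Tool in ZhiHu/settings.py
# and NumberCheck in ZhiHu/MysqlPipelines/Mysql.py) are not shown in this file.
# The sketches below only illustrate the behaviour this spider relies on; the
# names, table layout and SQL are assumptions, not the original implementation.
# ---------------------------------------------------------------------------
import re

class _ToolSketch(object):
    '''Sketch of Tool: strip HTML tags and collapse whitespace in free text.'''
    tag = re.compile(r'<[^>]+>')

    def replace(self, text):
        if text is None:
            return ''
        return self.tag.sub('', text).replace('\n', ' ').strip()

class _NumberCheckSketch(object):
    '''Sketch of NumberCheck: read crawl progress back out of MySQL.

    Assumed table: zhihu(Save_Num INT, Last_Num INT, Real_Num INT, ...).
    Each classmethod returns None while the table is still empty (first run).
    '''
    @classmethod
    def _fetch_one(cls, sql):
        import pymysql  # assumed driver; connection details are placeholders
        conn = pymysql.connect(host='localhost', user='root',
                               password='', db='zhihu', charset='utf8mb4')
        try:
            with conn.cursor() as cur:
                cur.execute(sql)
                row = cur.fetchone()
                return row[0] if row else None
        finally:
            conn.close()

    @classmethod
    def find_save(cls):
        # number of records already saved
        return cls._fetch_one('SELECT MAX(Save_Num) FROM zhihu') or 0

    @classmethod
    def find_db_real(cls):
        # Real_Num of the last row written by the previous run
        return cls._fetch_one('SELECT Real_Num FROM zhihu ORDER BY Save_Num DESC LIMIT 1')

    @classmethod
    def find_last(cls):
        # follower total recorded by the previous run
        return cls._fetch_one('SELECT Last_Num FROM zhihu ORDER BY Save_Num DESC LIMIT 1')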