# -*- coding: utf-8 -*-
from scrapy import Request,Spider
import json
from ..items import UserItem
class ZhihuSpider(Spider):
    """Crawl Zhihu user profiles by walking the followee/follower graph.

    The crawl starts at ``start_user``: its profile and the first page of
    both of its relation lists are requested.  Every user found in a relation
    list is fetched through :meth:`parse_user`, which emits a ``UserItem``
    and then schedules that user's own relation lists, so the crawl keeps
    expanding from there.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    # Not read by this class: start_requests() builds its own initial requests.
    start_urls = ['http://www.zhihu.com/']

    # Identifier (url_token) of the account the crawl starts from.
    start_user = 'excited-vczh'

    # Page size requested for the first page of every relation list.
    page_limit = 20

    # Profile endpoint and the list of fields requested through ``include``.
    user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
    # NOTE(review): the list contains both ``thanked_Count`` and
    # ``thanked_count``; the capitalised one looks like a typo.  Left as-is
    # because the API's handling of unknown fields is not visible here.
    user_query = "locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,columns_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_bind_phone,is_force_renamed,is_bind_sina,is_privacy_protected,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics"

    # Followee list ("who this user follows") endpoint and its include fields.
    follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"

    # Follower list ("who follows this user") endpoint and its include fields.
    followers_url = "https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"

    def start_requests(self):
        """Yield the seed requests: the start user's profile and both lists."""
        yield Request(
            self.user_url.format(user=self.start_user, include=self.user_query),
            callback=self.parse_user,
        )
        # The follower request previously reused ``follows_url`` and therefore
        # just repeated the followee request; the shared helper uses the
        # matching URL template for each list.
        yield from self._relation_requests(self.start_user)

    def parse_user(self, response):
        """Emit a ``UserItem`` for one profile and schedule its relation lists.

        Only keys that are declared on ``UserItem`` and present in the JSON
        body are copied into the item.
        """
        result = json.loads(response.text)

        item = UserItem()
        for field in item.fields:
            if field in result:
                item[field] = result[field]
        yield item

        # Without a url_token the list URLs would be built for the literal
        # user "None", so such a profile is not expanded any further.
        url_token = result.get('url_token')
        if url_token:
            yield from self._relation_requests(url_token)

    def parse_follows(self, response):
        """Handle one page of a followee list (see ``_parse_relation_page``)."""
        yield from self._parse_relation_page(response, self.parse_follows)

    def parse_followers(self, response):
        """Handle one page of a follower list (see ``_parse_relation_page``)."""
        yield from self._parse_relation_page(response, self.parse_followers)

    def _relation_requests(self, url_token):
        """Yield first-page followee and follower requests for ``url_token``."""
        yield Request(
            self.follows_url.format(
                user=url_token,
                include=self.follows_query,
                offset=0,
                limit=self.page_limit,
            ),
            callback=self.parse_follows,
        )
        yield Request(
            self.followers_url.format(
                user=url_token,
                include=self.followers_query,
                offset=0,
                limit=self.page_limit,
            ),
            callback=self.parse_followers,
        )

    def _parse_relation_page(self, response, callback):
        """Yield profile requests for one list page, then the next page.

        ``callback`` is the method that should receive the next page, i.e.
        the public parse method this helper was called from.
        """
        results = json.loads(response.text)

        # ``or ()`` / ``or {}`` tolerate a key that is absent or null.
        for entry in results.get('data') or ():
            url_token = entry.get('url_token')
            if url_token:
                yield Request(
                    self.user_url.format(user=url_token, include=self.user_query),
                    callback=self.parse_user,
                )

        paging = results.get('paging') or {}
        next_page = paging.get('next')
        # Paginate only on an explicit ``is_end: false``; a missing flag stops
        # the walk, as before.  A missing ``next`` URL is skipped rather than
        # handed to Request.
        if paging.get('is_end') is False and next_page:
            yield Request(next_page, callback=callback)
没有合适的资源?快使用搜索试试~ 我知道了~
python爬虫实战笔记---以轮子哥为起点Scrapy爬取知乎用户信息
共18个文件
py:7个
pyc:6个
xml:3个
需积分: 50 14 下载量 165 浏览量
2017-10-13
14:57:04
上传
评论
收藏 14KB ZIP 举报
温馨提示
Scrapy入门项目1--爬取知乎用户信息 1.选定起始人:选定一个关注数量或粉丝数量多的大佬 2.获取粉丝和关注列表 3.获取列表用户信息 4.获取每位用户粉丝和关注
资源推荐
资源详情
资源评论
收起资源包目录
zhihuuser.zip (18个子文件)
zhihuuser
zhihuuser
middlewares.py 2KB
pipelines.py 718B
spiders
zhihu.py 4KB
__pycache__
__init__.cpython-35.pyc 135B
zhihu.cpython-35.pyc 4KB
__init__.py 161B
__pycache__
pipelines.cpython-35.pyc 1KB
__init__.cpython-35.pyc 127B
settings.cpython-35.pyc 719B
items.cpython-35.pyc 2KB
items.py 2KB
__init__.py 0B
settings.py 269B
scrapy.cfg 262B
.idea
misc.xml 219B
modules.xml 270B
zhihuuser.iml 398B
workspace.xml 15KB
共 18 条
- 1
资源评论
王师北
- 粉丝: 129
- 资源: 3
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功