Python 爬虫实战笔记——以轮子哥为起点，使用 Scrapy 爬取知乎用户信息（资源）- CSDN文库

共18个文件

py：7个

pyc：6个

xml：3个

Scrapy

需积分：50 · 165 浏览量 · 2017-10-13 14:57:04 上传 · 评论 · 收藏 · 14KB · ZIP · 举报

资源推荐

资源详情

资源评论

收起资源包目录

zhihuuser.zip （18个子文件）

zhihuuser

middlewares.py 2KB

pipelines.py 718B

spiders

zhihu.py 4KB

__pycache__

__init__.cpython-35.pyc 135B

zhihu.cpython-35.pyc 4KB

__init__.py 161B

__pycache__

pipelines.cpython-35.pyc 1KB

__init__.cpython-35.pyc 127B

settings.cpython-35.pyc 719B

items.cpython-35.pyc 2KB

items.py 2KB

__init__.py 0B

settings.py 269B

scrapy.cfg 262B

.idea

misc.xml 219B

modules.xml 270B

zhihuuser.iml 398B

workspace.xml 15KB

# -*- coding: utf-8 -*-
import json

from scrapy import Request, Spider

from ..items import UserItem


class ZhihuSpider(Spider):
    """Crawl Zhihu user profiles by walking the followee/follower graph.

    The crawl starts from ``start_user``. For every user it requests the
    profile endpoint (yielding a ``UserItem``) and the first page of both the
    ``followees`` and ``followers`` endpoints. Every account listed on those
    pages is in turn requested as a profile, so the crawl spreads outwards
    from the seed user.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    # url_token of the account the crawl is seeded with.
    start_user = 'excited-vczh'

    # Profile endpoint; ``include`` selects which profile fields are returned.
    # NOTE(review): the query contains both ``thanked_Count`` and
    # ``thanked_count``; it is kept verbatim because it is sent to the API
    # as-is -- confirm against the API before normalising it.
    user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
    user_query = "locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,columns_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_bind_phone,is_force_renamed,is_bind_sina,is_privacy_protected,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics"

    # Paged list of the accounts a user follows ("followees").
    follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"

    # Paged list of the accounts following a user ("followers").
    followers_url = "https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics"

    def start_requests(self):
        """Yield the seed requests: profile, followees and followers of ``start_user``."""
        yield Request(
            self.user_url.format(user=self.start_user, include=self.user_query),
            callback=self.parse_user,
        )
        yield Request(
            self.follows_url.format(
                user=self.start_user,
                include=self.follows_query,
                offset=0,
                limit=20,
            ),
            callback=self.parse_follows,
        )
        # Must use followers_url here: the original formatted follows_url, so
        # the seed user's follower list was never requested.
        yield Request(
            self.followers_url.format(
                user=self.start_user,
                include=self.followers_query,
                offset=0,
                limit=20,
            ),
            callback=self.parse_followers,
        )

    def parse_user(self, response):
        """Turn a profile response into a ``UserItem`` and fan out to its lists.

        Copies every key of the JSON payload that is also a declared field of
        ``UserItem`` into the item, yields it, then yields requests for the
        first followees page and the first followers page of that user.
        """
        result = json.loads(response.text)

        item = UserItem()
        for field in item.fields:
            if field in result:
                item[field] = result.get(field)
        yield item

        url_token = result.get('url_token')
        if not url_token:
            # Without a token the URLs below would point at ".../members/None".
            return

        yield Request(
            self.follows_url.format(
                user=url_token,
                include=self.follows_query,
                limit=20,
                offset=0,
            ),
            self.parse_follows,
        )
        yield Request(
            self.followers_url.format(
                user=url_token,
                include=self.followers_query,
                limit=20,
                offset=0,
            ),
            self.parse_followers,
        )

    def parse_follows(self, response):
        """Handle one followees page; pagination continues with this method."""
        return self._parse_user_page(response, self.parse_follows)

    def parse_followers(self, response):
        """Handle one followers page; pagination continues with this method."""
        return self._parse_user_page(response, self.parse_followers)

    def _parse_user_page(self, response, next_callback):
        """Yield a profile request per listed user, then the next-page request.

        Shared by ``parse_follows`` and ``parse_followers``, which differ only
        in the callback used for the following page.

        :param response: response whose body is a JSON object with optional
            ``data`` (list of user dicts) and ``paging`` keys.
        :param next_callback: callback attached to the next-page request.
        """
        results = json.loads(response.text)

        # ``or []`` / ``or {}`` also cover keys that are present but null.
        for entry in results.get('data') or []:
            url_token = entry.get('url_token')
            if not url_token:
                continue
            yield Request(
                self.user_url.format(user=url_token, include=self.user_query),
                callback=self.parse_user,
            )

        paging = results.get('paging') or {}
        # Only an explicit JSON ``false`` means more pages are available.
        if paging.get('is_end') is False:
            next_page = paging.get('next')
            if next_page:
                yield Request(next_page, callback=next_callback)

评论收藏

内容反馈