# -*- coding: utf-8 -*-
import re
import json
import datetime
# Python 2/3 compatibility shim: the Py2 ``urlparse`` module was merged
# into ``urllib.parse`` in Python 3.  After this block, ``parse`` exposes
# urljoin/urlparse/... on either interpreter.
try:
    import urlparse as parse  # Python 2
except ImportError:  # was a bare ``except:`` — would also swallow SystemExit etc.
    from urllib import parse  # Python 3
import scrapy
from scrapy.loader import ItemLoader
from items import ZhihuQuestionItem, ZhihuAnswerItem
class ZhihuSpider(scrapy.Spider):
    """Zhihu spider: logs in via a Selenium-driven browser, hands the
    resulting session cookies to Scrapy, then crawls questions/answers.

    The answer data itself is fetched through Zhihu's JSON API
    (``start_answer_url``); question pages are parsed from HTML.
    """
    name = "zhihu_sel"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    # URL template for the first page of answers of a question.
    # Format slots: {0} question id, {1} page size (limit), {2} offset.
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

    headers = {
        "HOST": "www.zhihu.com",
        # BUG FIX: was "https://www.zhizhu.com" (typo'd domain) — a wrong
        # Referer defeats the purpose of sending one for anti-scraping checks.
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    # Cookies must stay enabled so the session obtained in start_requests()
    # is carried through every subsequent request.
    custom_settings = {
        "COOKIES_ENABLED": True
    }

    def parse(self, response):
        """Extract all URLs from the HTML page and follow them.

        URLs matching ``/question/xxx`` are downloaded and routed straight
        to :meth:`parse_question`.  (Not implemented yet.)
        """
        pass

    def parse_question(self, response):
        """Parse a question page and build a concrete question item.

        (Not implemented yet.)
        """
        pass

    def parse_answer(self, response):
        """Parse one page of the answers JSON API.  (Not implemented yet.)

        BUG FIX: parameter was misspelled ``reponse``.
        """
        pass

    def start_requests(self):
        """Log in to Zhihu with Selenium, capture the session cookies,
        persist them to disk, and seed the crawl with an authenticated
        request to the start URL.

        Returns a single-element list with the initial ``scrapy.Request``.
        """
        # Imports kept function-local so the spider module can be imported
        # without Selenium installed.
        from selenium import webdriver
        import time
        import pickle

        # NOTE(review): hard-coded local paths (chromedriver, cookie dir)
        # should come from settings — TODO make configurable.
        browser = webdriver.Chrome(executable_path="E:/test/chromedriver.exe")
        browser.get("https://www.zhihu.com/signin")

        # NOTE(review): find_element_by_css_selector was removed in
        # Selenium 4 — confirm the pinned Selenium version before upgrading.
        browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(
            "xxxx")
        browser.find_element_by_css_selector(".SignFlow-password input").send_keys(
            "xxxx")
        browser.find_element_by_css_selector(
            ".Button.SignFlow-submitButton").click()

        # Crude wait for the login round-trip to finish before reading cookies.
        time.sleep(10)

        Cookies = browser.get_cookies()
        print(Cookies)

        cookie_dict = {}
        for cookie in Cookies:
            # Persist each cookie individually so a later run can restore
            # the session without logging in again.  ``with`` guarantees the
            # file handle is closed even if pickling fails (was a bare
            # open/close pair before).
            with open('G:/scrapy/ArticleSpider/cookies/zhihu/' + cookie['name'] + '.zhihu', 'wb') as f:
                pickle.dump(cookie, f)
            cookie_dict[cookie['name']] = cookie['value']
        browser.close()

        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
python分布式爬虫打造搜索引擎课程的资源 课程地址:http://coding.imooc.com/class/92.html?mc_marking=1f1eb391b59b3e4139718a46d8673049&mc_channel=syb10
资源推荐
资源详情
资源评论
收起资源包目录
ArticleSpider-resourses-master.zip (7个子文件)
ArticleSpider-resourses-master
zhihu_sel.py 3KB
zhihu_login_requests.py 3KB
ppt.rar 1.58MB
chromedriver_mac64 (2).zip 5.32MB
chromedriver_win32 (6).zip 3.2MB
chromedriver_linux64 (1).zip 3.54MB
request_captcha_cn.py 2KB
共 7 条
- 1
资源评论
博士僧小星
- 粉丝: 1907
- 资源: 5877
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功