# coding=utf-8
import json
import re
from urlparse import urljoin
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from mafengwo111.jieban import JiebanItem
from mafengwo111.qa import QAItem
from mafengwo111.html import html2text, parse_time
__author__ = 'zephyre'
class MafengwoQaSpider(scrapy.Spider):
    """Crawl questions and answers from mafengwo.cn's travel Q&A section.

    Entry points are the paginated question-index AJAX endpoint (see
    start_requests); each question page is parsed into a 'question' item
    and its answers are fetched page by page as 'answer' items.
    """
    name = 'mafengwo-qa'
def parse(self, response):
    """Parse one page of the question index.

    The endpoint returns JSON whose ``payload.list_html`` field carries an
    HTML fragment listing questions; extract every question link from it
    and schedule a request for each question detail page.
    """
    html_text = json.loads(response.body)['payload']['list_html']
    for href in Selector(text=html_text).xpath(
            '//li//div[@class="title"]/a[@href]/@href').extract():
        # Hrefs in the fragment are relative; resolve against the page URL.
        url = urljoin(response.url, href)
        yield Request(url=url, callback=self.parse_question)
def start_requests(self):
    """Seed the crawl with the paginated question-index AJAX endpoint.

    Walks offsets 0, 20, 40, ... up to (but not including) 500, i.e. the
    first 25 index pages of 20 questions each.
    """
    template = 'http://www.mafengwo.cn/qa/ajax_pager.php?action=question_index&start=%d'
    for offset in xrange(0, 500, 20):
        yield Request(url=template % offset)
def parse_question(self, response):
    """Parse a question detail page.

    Follows links in the "related questions" sidebar, emits the question
    itself as an item, then schedules the first AJAX page of its answers.
    """
    # Related questions in the sidebar are crawled recursively.
    related = response.selector.xpath(
        '//div[@class="q-relate"]/ul[@class="bd"]/li/a[@href]/@href').extract()
    for href in related:
        yield Request(url=urljoin(response.url, href), callback=self.parse_question)

    question = self.retrive_question(response)
    yield question

    # Kick off answer crawling for this question, starting at page 0.
    qid = question['qid']
    page = 0
    page_size = 50
    answers_url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
                  % (qid, page * page_size)
    yield Request(url=answers_url, callback=self.parse_answer_list,
                  meta={'qid': qid, 'page': page, 'page_size': page_size})
def retrive_question(self, response):
    """Build a 'question' QAItem from a question detail page.

    Returns a QAItem with qid, title, author, contents, tags, topic,
    timestamp and view count. Raises IndexError (after logging the URL)
    when the page lacks the expected author block, e.g. for deleted or
    otherwise invalid questions.
    """
    author_links = response.selector.xpath(
        '//div[@class="q-detail"]//div[@class="pub-bar fr"]//a[@href]')
    try:
        user_href = author_links[0].xpath('./@href').extract()[0]
    except IndexError:
        self.logger.warning('Invalid response: %s' % response.url)
        raise
    author_id = int(re.search(r'/wenda/u/(\d+)', user_href).group(1))

    # Strip the thumbnail size marker (".head.wNN.") to get the
    # full-size avatar URL.
    avatar_src = author_links[0].xpath('./img/@src').extract()[0]
    author_avatar = re.sub(r'\.head\.w\d+\.', '.', avatar_src)
    if author_avatar.endswith('pp48.gif'):
        # pp48.gif is the site's default placeholder avatar; treat as absent.
        author_avatar = None

    author_name = response.selector.xpath(
        '//div[@class="q-content"]//div[@class="pub-bar fr"]/a[@class="name"]/text()').extract()[0]
    title = response.selector.xpath(
        '//div[@class="q-content"]/div[@class="q-title"]/h1/text()').extract()[0]
    raw_contents = response.selector.xpath(
        '//div[@class="q-content"]//div[@class="q-desc"]').extract()[0]
    contents = html2text(raw_contents)
    topic = response.selector.xpath(
        '//div[@class="q-content"]/div[@class="q-title"]//a[@class="location"]/text()').extract()[0]

    # The counter text reads like u'1234浏览' ("1234 views"); drop the
    # Chinese suffix and parse the remaining digits.
    view_text = response.selector.xpath(
        '//div[@class="q-detail"]//div[@class="fr"]//span[@class="atten-num"]/text()').extract()[0]
    view_cnt = int(re.sub(u'浏览', u'', view_text))

    timestamp = response.selector.xpath(
        '//div[@class="q-content"]//div[@class="pub-bar fr"]//span[@class="time"]//span/text()').extract()[0]
    tags = response.selector.xpath(
        '//div[@class="q-content"]//a[@class="a-tag"]/text()').extract()
    qid = int(re.search(r'detail-(\d+)\.html', response.url).group(1))

    item = QAItem()
    item['source'] = 'mafengwo'
    item['type'] = 'question'
    item['qid'] = qid
    item['title'] = title
    item['author_nickname'] = author_name
    item['author_id'] = author_id
    if author_avatar:
        item['author_avatar'] = author_avatar
        item['file_urls'] = [author_avatar]
    item['timestamp'] = timestamp
    if topic:
        item['topic'] = topic
    item['contents'] = contents
    item['tags'] = tags
    item['view_cnt'] = view_cnt
    return item
def parse_answer_list(self, response):
    """Parse one AJAX page of answers for a question.

    While a full page (page_size answers) keeps coming back, the next
    page is scheduled; each answer node is then emitted as an 'answer'
    QAItem.
    """
    meta = response.meta
    qid = meta['qid']
    page = meta['page']
    page_size = meta['page_size']
    sel = Selector(text=json.loads(response.body)['payload']['list_html'])
    answer_nodes = sel.xpath('//li[contains(@class, "answer-item")]')
    if not answer_nodes:
        return

    # A full page suggests more answers exist: request the next page.
    if len(answer_nodes) == page_size:
        next_page = page + 1
        url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
              % (qid, next_page * page_size)
        yield Request(url=url, callback=self.parse_answer_list,
                      meta={'qid': qid, 'page': next_page, 'page_size': page_size})

    for answer_node in sel.xpath('//li[contains(@class, "answer-item") and @data-aid]'):
        aid = int(answer_node.xpath('./@data-aid').extract()[0])
        # Use relative XPaths ('.//') so each answer node yields its OWN
        # author/content; the previous absolute '//' queries matched the
        # whole fragment and returned the first answer's data every time.
        author_node = answer_node.xpath('.//a[@class="_j_filter_click avatar"]')[0]
        author_href = author_node.xpath('./@href').extract()[0]
        author_id = int(re.search(r'(\d+)', author_href).group(1))
        avatar_src = author_node.xpath('./img/@src').extract()[0]
        # Strip the thumbnail size marker to get the full-size avatar URL.
        author_avatar = re.sub(r'\.head\.w\d+\.', '.', avatar_src)
        author_name = answer_node.xpath('.//a[@class="name"]/text()').extract()[0]
        if author_avatar.endswith('pp48.gif'):
            # pp48.gif is the site's default placeholder avatar; treat as absent.
            author_avatar = None
        content_html = answer_node.xpath('.//div[contains(@class,"_j_answer_html")]').extract()[0]
        timestamp = answer_node.xpath(
            './/div[@class="a-operate _js_operate clearfix"]//div[@class="pub-time"]//span/text()').extract()[0]
        contents = html2text(content_html)
        try:
            vote_cnt = int(answer_node.xpath(
                './/a[@class="btn-ding _js_zan"]//span/text()').extract()[0])
        except (IndexError, ValueError):
            self.logger.debug(u'Invalid vote count: %s' % answer_node.extract())
            vote_cnt = 0

        item = QAItem()
        item['type'] = 'answer'
        item['source'] = 'mafengwo'
        item['qid'] = qid
        item['aid'] = aid
        item['author_nickname'] = author_name
        item['author_id'] = author_id
        if author_avatar:
            item['author_avatar'] = author_avatar
            item['file_urls'] = [author_avatar]
        item['timestamp'] = timestamp
        item['contents'] = contents
        item['vote_cnt'] = vote_cnt
        # The original source was truncated right after building the item
        # (stray '#i'); without this yield no answer items are ever emitted.
        yield item
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
马蜂窝爬虫案例.zip (26个子文件)
mafengwo111
mafengwo111
spiders
mafengwo.py 16KB
mafengwo.pyc 14KB
__init__.pyc 145B
__init__.py 161B
items.py 290B
test.py 99B
pipelines.py 789B
middlewares.py 471B
jieban.pyc 808B
qyer.py 12KB
qa.pyc 823B
settings.py 1KB
user_agents.py 7KB
settings.pyc 463B
qa.py 905B
items.pyc 374B
__init__.pyc 137B
__init__.py 0B
user_agents.pyc 7KB
pipelines.pyc 1KB
html.pyc 2KB
middlewares.pyc 759B
html.py 2KB
jieban.py 790B
scrapy.cfg 266B
Begin.py 81B
共 26 条
- 1
资源评论
张小竟
- 粉丝: 61
- 资源: 17
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功