马蜂窝爬虫案例解析资源-CSDN文库

共26个文件

py：14个

pyc：11个

cfg：1个

python

scrapy

需积分: 42 72 浏览量 2017-03-13 16:11:50 上传评论收藏 29KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

马蜂窝爬虫案例.zip （26个子文件）

mafengwo111

spiders

mafengwo.py 16KB

mafengwo.pyc 14KB

__init__.pyc 145B

__init__.py 161B

items.py 290B

test.py 99B

pipelines.py 789B

middlewares.py 471B

jieban.pyc 808B

qyer.py 12KB

qa.pyc 823B

settings.py 1KB

user_agents.py 7KB

settings.pyc 463B

qa.py 905B

items.pyc 374B

__init__.pyc 137B

__init__.py 0B

user_agents.pyc 7KB

pipelines.pyc 1KB

html.pyc 2KB

middlewares.pyc 759B

html.py 2KB

jieban.py 790B

scrapy.cfg 266B

Begin.py 81B

# coding=utf-8 import json import re from urlparse import urljoin import scrapy from scrapy.http import Request from scrapy.selector import Selector from mafengwo111.jieban import JiebanItem from mafengwo111.qa import QAItem from mafengwo111.html import html2text, parse_time __author__ = 'zephyre' class MafengwoQaSpider(scrapy.Spider): name = 'mafengwo-qa' def parse(self, response): html_text = json.loads(response.body)['payload']['list_html'] #print(json.dumps(html_text)) for href in Selector(text=html_text).xpath( #'//li/div[@class="wen"]//div[@class="title"]/a[@href]/@href').extract(): '//li//div[@class="title"]/a[@href]/@href').extract(): url = urljoin(response.url, href) print("------"+url) yield Request(url=url, callback=self.parse_question) def start_requests(self): for start_idx in xrange(0, 500, 20): yield Request(url='http://www.mafengwo.cn/qa/ajax_pager.php?action=question_index&start=%d' % start_idx) def parse_question(self, response): # 抓取相关问题 for related_href in response.selector.xpath( '//div[@class="q-relate"]/ul[@class="bd"]/li/a[@href]/@href').extract(): url = urljoin(response.url, related_href) yield Request(url=url, callback=self.parse_question) q_item = self.retrive_question(response) yield q_item # 抓取回答 qid = q_item['qid'] page = 0 page_size = 50 url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \ % (qid, page * page_size) #print("answerUrl="+url) yield Request(url=url, callback=self.parse_answer_list, meta={'qid': qid, 'page': page, 'page_size': page_size}) def retrive_question(self, response): """ 分析response，得到问题 """ #tmp = response.selector.xpath('//div[@class="q-detail"]/div[@class="person"]/div[@class="avatar"]/a[@href]') tmp = response.selector.xpath('//div[@class="q-detail"]//div[@class="pub-bar fr"]//a[@href]') try: user_href = tmp[0].xpath('./@href').extract()[0] except IndexError: self.logger.warning('Invalid response: %s' % response.url) #self.logger.warning(response.body) raise m = re.search(r'/wenda/u/(\d+)', user_href) author_id = int(m.group(1)) tmp = tmp[0].xpath('./img/@src').extract()[0] author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp) if author_avatar.endswith('pp48.gif'): author_avatar = None author_name = response.selector.xpath( '//div[@class="q-content"]//div[@class="pub-bar fr"]/a[@class="name"]/text()').extract()[0] title = response.selector.xpath('//div[@class="q-content"]/div[@class="q-title"]/h1/text()').extract()[0] raw_contents = \ response.selector.xpath('//div[@class="q-content"]//div[@class="q-desc"]').extract()[0] contents = html2text(raw_contents) topic = response.selector.xpath( '//div[@class="q-content"]/div[@class="q-title"]//a[@class="location"]/text()').extract()[0] #print("tmp="+tmp) #topic=re.search('<i></i>*',tmp).group(0); #print("topic="+topic) #view_cnt = int(re.search(ur'(\d+)\s*浏览', tmp).group(1)) temp=response.selector.xpath( '//div[@class="q-detail"]//div[@class="fr"]//span[@class="atten-num"]/text()').extract()[0] print("temp="+temp) view_cnt = news=int(re.sub('浏览'.decode("utf8"),'',temp)) print("view_cnt="+str(view_cnt)) timestamp = response.selector.xpath( '//div[@class="q-content"]//div[@class="pub-bar fr"]//span[@class="time"]//span/text()').extract()[0] #timestamp = parse_time(time_str) #print('timestamp='+timestamp) #tmp = response.selector.xpath( # '//div[@class="q-content"]/div[@class="user-bar"]/span[@class="fr"]/a[@href]/text()').extract() #if tmp and tmp[0].strip(): # topic = tmp[0].strip() #else: # topic = None tags = response.selector.xpath( '//div[@class="q-content"]//a[@class="a-tag"]/text()').extract() #tags = [tmp.strip() for tmp in raw_tags if tmp.strip()] match = re.search(r'detail-(\d+)\.html', response.url) qid = int(match.group(1)) item = QAItem() item['source'] = 'mafengwo' item['type'] = 'question' item['qid'] = qid item['title'] = title item['author_nickname'] = author_name item['author_id'] = author_id if author_avatar: item['author_avatar'] = author_avatar item['file_urls'] = [author_avatar] item['timestamp'] = timestamp if topic: item['topic'] = topic item['contents'] = contents item['tags'] = tags item['view_cnt'] = view_cnt return item def parse_answer_list(self, response): meta = response.meta qid = meta['qid'] page = meta['page'] page_size = meta['page_size'] print('-------'+response.url) sel = Selector(text=json.loads(response.body)['payload']['list_html']) answer_nodes = sel.xpath('//li[contains(@class, "answer-item")]') if not answer_nodes: return # 查找下一页 if len(answer_nodes) == page_size: next_page = page + 1 url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \ % (qid, next_page * page_size) yield Request(url=url, callback=self.parse_answer_list, meta={'qid': qid, 'page': next_page, 'page_size': page_size}) for answer_node in sel.xpath('//li[contains(@class, "answer-item") and @data-aid]'): aid = int(answer_node.xpath('./@data-aid').extract()[0]) author_node = answer_node.xpath('//a[@class="_j_filter_click avatar"]')[0] temp = author_node.xpath('./@href').extract()[0] author_id=re.search(r'(\d+)', temp).group() tmp = author_node.xpath('//a/img/@src').extract()[0] author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp) author_name = answer_node.xpath('//a[@class="name"]/text()').extract()[0] if author_avatar.endswith('pp48.gif'): author_avatar = None content_node = answer_node.xpath('//div[contains(@class,"_j_answer_html")]').extract()[0] timestamp = answer_node.xpath('//div[@class="a-operate _js_operate clearfix"]//div[@class="pub-time"]//span/text()').extract()[0] #timestamp = parse_time(time_str) #accepted = bool(answer_node.xpath('.//div[contains(@class,"answer-best")]')) #raw_contents = content_node.xpath('.//dl/dd[@class="_j_answer_html"]').extract()[0] contents = html2text(content_node) try: vote_cnt = int(answer_node.xpath('//a[@class="btn-ding _js_zan"]//span/text()').extract()[0]) except (IndexError, ValueError): self.logger.debug(u'Invalid vote count: %s' % answer_node.extract()[0]) vote_cnt = 0 item = QAItem() item['type'] = 'answer' item['source'] = 'mafengwo' item['qid'] = qid item['aid'] = aid item['author_nickname'] = author_name item['author_id'] = author_id if author_avatar: item['author_avatar'] = author_avatar item['file_urls'] = [author_avatar] item['timestamp'] = timestamp item['contents'] = contents item['vote_cnt'] = vote_cnt #i

评论收藏

内容反馈