# -*- coding: utf-8 -*-
import scrapy
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
class DpSpiderSpider(scrapy.spider.Spider):
    """Demo spider that prints shop ids and names from dianping.com listings.

    The spider requests a small, fixed set of search-result pages and, for
    every shop entry found on a page, prints the shop id (last path segment
    of the entry's link) and the shop name (the ``title`` of the entry's
    image). Nothing is yielded as an item; output goes to stdout only.
    """

    name = "dp_spider"
    allowed_domains = ["dianping.com"]

    # Demo only: request the first result pages (page 1 and page 2, because
    # range(1, 3) excludes 3). To crawl more, widen the range or yield
    # additional Request objects from parse().
    start_urls = [
        'http://www.dianping.com/search/category/1/10/o10p' + str(page)
        for page in range(1, 3)
    ]

    # HTTP error statuses that should still be delivered to parse() instead
    # of being dropped before the callback is invoked.
    handle_httpstatus_list = [404, 403]

    def parse(self, response):
        """Handle one listing page, including the 403 and 404 cases.

        Args:
            response: the response for one of the listing URLs. Its
                ``status`` may be 403 or 404 as well as a success code,
                because both are listed in ``handle_httpstatus_list``.

        Yields:
            A new ``Request`` for the same URL when the server answered 403
            (after pausing for 600 seconds). Nothing is yielded otherwise.
        """
        # Single-argument print(...) is valid on both Python 2 and Python 3;
        # the '%s %s' form reproduces the text that the two-operand print
        # statement used to emit.
        print('\n')
        print('%s %s' % ('crawl url = ', response.url))

        if response.status == 403:
            # Crawling was refused: pause for 10 minutes, then try again.
            print('meet 403, sleep 600 seconds')
            import time
            # NOTE: time.sleep() blocks the calling thread, so the whole
            # crawl is paused for the duration, not only this URL.
            time.sleep(600)
            # This URL has already been requested once, so the duplicate
            # filter would silently discard the retry unless it is bypassed
            # with dont_filter=True.
            yield Request(response.url, callback=self.parse, dont_filter=True)
        elif response.status == 404:
            # The page does not exist: nothing to parse.
            print('meet 404, return')
        else:
            # Normal page: walk every shop entry in the result list.
            selector = scrapy.Selector(response)
            entries = selector.xpath('//ul[@class="shop-list J_shop-list"]/li')
            for entry in entries:
                hrefs = entry.xpath('a/@href').extract()
                titles = entry.xpath('a/img/@title').extract()
                if not hrefs or not titles:
                    # Entry without a link or an image title (unexpected
                    # markup): skip it rather than raising IndexError and
                    # losing the remaining shops on this page.
                    continue
                shopid = hrefs[0].split('/')[-1]
                print('%s %s' % ('shopid = ', shopid))
                shopname = titles[0]
                print('%s %s' % ("shopname = ", shopname))

生瓜蛋子
- 粉丝: 3959
- 资源: 7441
最新资源
- 数据分析_Python技术_全面资料汇总_学习与实践_1741400354.zip
- navinreddy20_Python_1741403174.zip
- gregmalcolm_python_koans_1741399104.zip
- dida_wins_setup_release_x64_6210.exe
- 考研数据结构笔记知识点
- CIBASetup_v3.0.3.exe
- anki-25.02-windows-qt6.exe
- Notion Setup 4.5.0.exe
- Notion Calendar Setup 1.127.0 - x64.exe
- sunshine-windows-installer.exe
- PicGo-Setup-2.4.0-beta.9-x64.exe
- tcmd1150x64.exe
- Trae CN-Setup-x64.exe
- Trae-Setup-x64_2.exe
- uTools-6.1.0.exe
- YoudaoDict_fanyiweb_navigation.exe
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈


