# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from souhu.items import SouhuItem
import copy
class SouhucarSpider(scrapy.Spider):
    """Crawl db.auto.sohu.com and yield one SouhuItem per car model.

    ``parse`` walks the brand tree on the database home page and follows
    each car-series link; ``parse_list`` extracts the individual models
    (trims) from the series page, deep-copying the partially filled item
    so every yielded item is independent.
    """

    name = 'souhucar'
    # BUG FIX: the target site is sohu.com, not "souhu.com".  With the old
    # typo every followed request survived OffsiteMiddleware only because
    # dont_filter=True is set on each Request below; the correct domain
    # makes the spider work even without that flag.
    allowed_domains = ['sohu.com']
    start_urls = ['http://db.auto.sohu.com/home/']

    def parse(self, response):
        """Parse the brand tree page; yield one Request per car series."""
        # Debug dump of the fetched page, kept for parity with the original
        # behaviour (the file is overwritten on every call).
        with open('res.html', 'wb') as f:
            f.write(response.body)
        allcars = response.xpath('//ul[@class="tree"]//li[@class="close_child"]')
        for cars in allcars:
            # Index [1]: the first text node appears to be whitespace and
            # the visible name the second — presumably; confirm against the
            # live page markup before changing.
            total_brand = cars.xpath('./h4[@class="brand_tit"]/a/text()')[1].extract()
            total_brand_url = cars.xpath('./h4[@class="brand_tit"]/a/@href').extract()[0]
            car_cons = cars.xpath('./ul[@class="tree_con"]')
            for car_con in car_cons:
                brands_name = car_con.xpath('./li[@class="con_tit"]/a/text()')[1].extract().strip()
                brands_url = car_con.xpath('./li[@class="con_tit"]/a/@href').extract()[0]
                brands = car_con.xpath('./li/a[@class="model-a"]')
                for brand in brands:
                    car_list_name = brand.xpath('./text()')[1].extract().strip()
                    # hrefs on this page are protocol-relative ("//db.auto...").
                    car_list_url = 'http:' + brand.xpath('./@href').extract()[0]
                    item = SouhuItem()
                    item['总车品牌'] = total_brand
                    item['总车品牌url'] = 'http:' + total_brand_url
                    item['车品牌'] = brands_name
                    item['车品牌url'] = 'http:' + brands_url
                    item['车系'] = car_list_name
                    item['车系url'] = car_list_url
                    # e.g. http://db.auto.sohu.com/yiqiaudi/4207 -> id "4207"
                    item['车系id'] = car_list_url.split('/')[-1]
                    # dont_filter=True: never let the dupefilter drop a
                    # series page request.
                    yield Request(car_list_url, self.parse_list,
                                  meta={'item': item}, dont_filter=True)

    def parse_list(self, response):
        """Parse one car-series page; yield a fully populated SouhuItem
        per model row found in the spec tables."""
        item = response.meta['item']
        car_sss = response.xpath('//div[@class="pri_box"]/a[2]/@href')
        if len(car_sss):
            # Reuse the already-selected nodes instead of re-running the
            # identical xpath (the original queried the page twice).
            item['车系询底价url'] = car_sss.extract()[0]
            item['车系询底价id'] = item['车系id']
        else:
            item['车系询底价url'] = '-'
            item['车系询底价id'] = '-'
        models = response.xpath('//table[@class="b jsq"]')
        for model in models:
            # Deep copy so sibling tables don't share mutable item state.
            n_item = copy.deepcopy(item)
            n_item['车型概述'] = model.xpath('./thead/tr/th[1]/text()').extract()[0]
            car_models = model.xpath('./tbody/tr[@class="s_list"]')
            for car_model in car_models:
                new_item = copy.deepcopy(n_item)
                new_item['车型'] = car_model.xpath('./td[1]/a/text()').extract()[0]
                new_item['车型url'] = 'http:' + car_model.xpath('./td[1]/a/@href').extract()[0]
                # Model id: last URL path segment, query string stripped.
                new_item['车型关注id'] = new_item['车型url'].split('/')[-1].split('?')[0]
                car_price = car_model.xpath('./td[4]/a/@href')
                if len(car_price) > 1:
                    new_item['车型询底价url'] = 'http:' + car_price[1].extract()
                    new_item['车型询底价id'] = new_item['车型关注id']
                else:
                    new_item['车型询底价url'] = '-'
                    new_item['车型询底价id'] = '-'
                car_id = new_item['车型关注id']
                # "_131721" is a hard-coded comparison-partner trim id —
                # presumably a fixed reference car; confirm before reuse.
                new_item['对比价的url'] = 'http://db.auto.sohu.com/pk/trim/' + car_id + '_131721/compare'
                yield new_item
没有合适的资源?快使用搜索试试~ 我知道了~
python爬取搜狐汽车网所有车
共24个文件
xml:8个
py:8个
pyc:5个
需积分: 27 29 下载量 69 浏览量
2018-01-16
11:45:16
上传
评论 1
收藏 756KB ZIP 举报
温馨提示
Python爬取搜狐汽车网,基于python3,将数据保存为xls格式,所有车品牌车系车型包括id
资源推荐
资源详情
资源评论
收起资源包目录
souhu.zip (24个子文件)
souhu
car.xls 6.64MB
souhu
settings.py 3KB
pipelines.py 1KB
middlewares.py 2KB
__init__.py 0B
items.py 814B
__pycache__
items.cpython-36.pyc 724B
__init__.cpython-36.pyc 130B
settings.cpython-36.pyc 296B
spiders
souhucar.py 3KB
__init__.py 161B
__pycache__
souhucar.cpython-36.pyc 2KB
__init__.cpython-36.pyc 138B
scrapy.cfg 254B
.idea
souhu.iml 515B
webServers.xml 607B
misc.xml 269B
remote-mappings.xml 499B
workspace.xml 115KB
encodings.xml 159B
inspectionProfiles
Project_Default.xml 444B
deployment.xml 465B
modules.xml 262B
start.py 122B
共 24 条
- 1
资源评论
从前重前
- 粉丝: 218
- 资源: 3
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功