# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from souhu.items import SouhuItem
import copy
class SouhucarSpider(scrapy.Spider):
    """Crawl db.auto.sohu.com and yield one SouhuItem per car model.

    ``parse`` walks the brand tree on the database home page and follows
    each car-series link; ``parse_list`` extracts the individual models
    (trims) from the series page, deep-copying the partially filled item
    so every yielded item is independent.
    """

    name = 'souhucar'
    # BUG FIX: the target site is sohu.com, not "souhu.com".  With the old
    # typo every followed request survived OffsiteMiddleware only because
    # dont_filter=True is set on each Request below; the correct domain
    # makes the spider work even without that flag.
    allowed_domains = ['sohu.com']
    start_urls = ['http://db.auto.sohu.com/home/']

    def parse(self, response):
        """Parse the brand tree page; yield one Request per car series."""
        # Debug dump of the fetched page, kept for parity with the original
        # behaviour (the file is overwritten on every call).
        with open('res.html', 'wb') as f:
            f.write(response.body)
        allcars = response.xpath('//ul[@class="tree"]//li[@class="close_child"]')
        for cars in allcars:
            # Index [1]: the first text node appears to be whitespace and
            # the visible name the second — presumably; confirm against the
            # live page markup before changing.
            total_brand = cars.xpath('./h4[@class="brand_tit"]/a/text()')[1].extract()
            total_brand_url = cars.xpath('./h4[@class="brand_tit"]/a/@href').extract()[0]
            car_cons = cars.xpath('./ul[@class="tree_con"]')
            for car_con in car_cons:
                brands_name = car_con.xpath('./li[@class="con_tit"]/a/text()')[1].extract().strip()
                brands_url = car_con.xpath('./li[@class="con_tit"]/a/@href').extract()[0]
                brands = car_con.xpath('./li/a[@class="model-a"]')
                for brand in brands:
                    car_list_name = brand.xpath('./text()')[1].extract().strip()
                    # hrefs on this page are protocol-relative ("//db.auto...").
                    car_list_url = 'http:' + brand.xpath('./@href').extract()[0]
                    item = SouhuItem()
                    item['总车品牌'] = total_brand
                    item['总车品牌url'] = 'http:' + total_brand_url
                    item['车品牌'] = brands_name
                    item['车品牌url'] = 'http:' + brands_url
                    item['车系'] = car_list_name
                    item['车系url'] = car_list_url
                    # e.g. http://db.auto.sohu.com/yiqiaudi/4207 -> id "4207"
                    item['车系id'] = car_list_url.split('/')[-1]
                    # dont_filter=True: never let the dupefilter drop a
                    # series page request.
                    yield Request(car_list_url, self.parse_list,
                                  meta={'item': item}, dont_filter=True)

    def parse_list(self, response):
        """Parse one car-series page; yield a fully populated SouhuItem
        per model row found in the spec tables."""
        item = response.meta['item']
        car_sss = response.xpath('//div[@class="pri_box"]/a[2]/@href')
        if len(car_sss):
            # Reuse the already-selected nodes instead of re-running the
            # identical xpath (the original queried the page twice).
            item['车系询底价url'] = car_sss.extract()[0]
            item['车系询底价id'] = item['车系id']
        else:
            item['车系询底价url'] = '-'
            item['车系询底价id'] = '-'
        models = response.xpath('//table[@class="b jsq"]')
        for model in models:
            # Deep copy so sibling tables don't share mutable item state.
            n_item = copy.deepcopy(item)
            n_item['车型概述'] = model.xpath('./thead/tr/th[1]/text()').extract()[0]
            car_models = model.xpath('./tbody/tr[@class="s_list"]')
            for car_model in car_models:
                new_item = copy.deepcopy(n_item)
                new_item['车型'] = car_model.xpath('./td[1]/a/text()').extract()[0]
                new_item['车型url'] = 'http:' + car_model.xpath('./td[1]/a/@href').extract()[0]
                # Model id: last URL path segment, query string stripped.
                new_item['车型关注id'] = new_item['车型url'].split('/')[-1].split('?')[0]
                car_price = car_model.xpath('./td[4]/a/@href')
                if len(car_price) > 1:
                    new_item['车型询底价url'] = 'http:' + car_price[1].extract()
                    new_item['车型询底价id'] = new_item['车型关注id']
                else:
                    new_item['车型询底价url'] = '-'
                    new_item['车型询底价id'] = '-'
                car_id = new_item['车型关注id']
                # "_131721" is a hard-coded comparison-partner trim id —
                # presumably a fixed reference car; confirm before reuse.
                new_item['对比价的url'] = 'http://db.auto.sohu.com/pk/trim/' + car_id + '_131721/compare'
                yield new_item
没有合适的资源?快使用搜索试试~ 我知道了~
python爬取搜狐汽车网所有车
共24个文件
xml:8个
py:8个
pyc:5个
需积分: 27 29 下载量 69 浏览量
2018-01-16
11:45:16
上传
评论 1
收藏 756KB ZIP 举报
温馨提示
Python爬取搜狐汽车网,基于python3,将数据保存为xls格式,所有车品牌车系车型包括id
资源推荐
资源详情
资源评论
收起资源包目录
souhu.zip (24个子文件)
souhu
car.xls 6.64MB
souhu
settings.py 3KB
pipelines.py 1KB
middlewares.py 2KB
__init__.py 0B
items.py 814B
__pycache__
items.cpython-36.pyc 724B
__init__.cpython-36.pyc 130B
settings.cpython-36.pyc 296B
spiders
souhucar.py 3KB
__init__.py 161B
__pycache__
souhucar.cpython-36.pyc 2KB
__init__.cpython-36.pyc 138B
scrapy.cfg 254B
.idea
souhu.iml 515B
webServers.xml 607B
misc.xml 269B
remote-mappings.xml 499B
workspace.xml 115KB
encodings.xml 159B
inspectionProfiles
Project_Default.xml 444B
deployment.xml 465B
modules.xml 262B
start.py 122B
共 24 条
- 1
资源评论
从前重前
- 粉丝: 218
- 资源: 3
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功