# -*- coding: utf-8 -*-
import scrapy
import re
import json
import time
from pyquery import PyQuery as p
from selenium import webdriver
import scrapy,pymysql,re
from scrapy import Selector
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
class AutohomeModelSpider(scrapy.Spider):
    """Autohome (autohome.com.cn) car series / model configuration spider.

    ``parse`` walks the per-letter brand index pages listed in ``start_urls``
    and, for every series found, ``get_html`` opens the series configuration
    page in a Selenium-driven Chrome and extracts the per-model configuration.

    Notes carried over from the original author:

    * In a full run some letters in ``chars`` fail to load; pull those
      letters out and run them separately.
    * ``d`` is the local path of the Chrome driver used for the
      browser-simulated crawl; it has to be downloaded and stored locally.
    """

    name = 'autohome_model'
    # The first entry used to read 'wwww.autohome.com.cn' (four w's), which
    # does not match the host used by ``start_urls``.
    allowed_domains = ['www.autohome.com.cn', 'car.autohome.com.cn']
    chars = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    start_urls = ['https://www.autohome.com.cn/grade/carhtml/%s.html' % (_) for _ in chars]
    d = '/Users/dsc/Downloads/chromedriver'  # Chrome driver used by selenium
    # NOTE(review): this statement runs when the class body is executed, so a
    # browser is started as soon as the module is imported.
    dr = webdriver.Chrome(executable_path=d)
    beats = []      # every series id seen by ``parse``
    error_ids = []  # series ids whose configuration page could not be read

    # Empty <span class="hs_..."></span> placeholders found in the page source.
    _OBFUSCATED_SPAN = re.compile(r'<span class="(hs_.{6,15})"></span>')
    # First number in a price cell; also accepts a single digit such as "8".
    _PRICE = re.compile(r'\d+(?:\.\d+)?')

    def parse(self, response):
        """Yield one result dict per car series listed on a brand index page.

        Each ``dl`` is treated as a brand; inside it the ``dd .h3-tit``
        headers (manufacturers) are paired positionally with the
        ``dd .rank-list-ul`` lists (their series).  ``li`` elements without
        an ``id`` attribute are skipped.
        """
        doc = p(response.text)
        for brand_node in doc.find('dl'):
            brand_pq = p(brand_node)
            brand = brand_pq.find('dt div a').html()
            manufacturers = brand_pq.find('dd .h3-tit')
            series_lists = brand_pq.find('dd .rank-list-ul')
            for maker_node, list_node in zip(manufacturers, series_lists):
                manufacturer = p(maker_node).find('a').html()
                for li in p(list_node).find('li'):
                    series = p(li)
                    raw_id = series.attr('id')
                    if not raw_id:
                        continue
                    series_id = raw_id.replace('s', '')
                    self.beats.append(series_id)
                    # The name is read from the same <li> as the id.  Pairing
                    # two separately selected lists by position would shift
                    # every following name if any <li> had no 'h4 a' link.
                    series_name = series.find('h4 a').html()
                    yield self.get_html(series_id, manufacturer, brand, series_name)

    def get_html(self, id, manufacturer, brand, name):
        """Load a series configuration page and return its result dict.

        Args:
            id: series id used to build the configuration page URL.
            manufacturer: manufacturer name taken from the index page.
            brand: brand name taken from the index page.
            name: series name taken from the index page.

        Returns:
            A dict with the keys ``id``, ``manufacturer``, ``brand``,
            ``name``, ``type``, ``source`` and ``models``.  ``models`` stays
            empty, and ``id`` is appended to ``error_ids``, when the element
            ``tab_0`` does not appear within 3 seconds or the page lacks the
            expected price table.
        """
        # Imported here so this block does not depend on a file-level import.
        from selenium.common.exceptions import WebDriverException

        result = {
            'id': '%s_%s_%s_%s' % ('QXZJ', brand, manufacturer, name),
            'manufacturer': manufacturer,
            'brand': brand,
            'name': name,
            'type': 'model',
            'source': '汽车之家',
            'models': [],
        }
        url = 'https://car.autohome.com.cn/config/series/%s.html' % (id)
        self.dr.get(url)
        try:
            WebDriverWait(self.dr, 3).until(
                EC.presence_of_element_located((By.ID, 'tab_0')))
        except WebDriverException:
            # Covers the wait timeout (TimeoutException is a subclass).
            self.error_ids.append(id)
            return result

        html = self._decode_obfuscated_spans(self.dr.page_source)
        models = self._extract_models(html)
        if models is None:
            # Unexpected layout: remember the id instead of raising, so the
            # remaining series of the current index page are still processed.
            self.error_ids.append(id)
            return result
        result['models'] = models
        return result

    def closed(self, reason):
        """Quit the Chrome driver when Scrapy closes the spider.

        ``dr`` is a class attribute, so it is shared by every instance of
        this spider in the process.
        """
        self.dr.quit()

    def _decode_obfuscated_spans(self, html):
        """Replace every empty ``hs_*`` span in *html* with its visible text.

        The text is read in the live browser from the ``content`` property of
        the computed ``before`` pseudo-element style of the first element
        carrying that class; double quotes are stripped from it.
        """
        script = (
            "return window.getComputedStyle("
            "document.getElementsByClassName(arguments[0])[0], 'before')"
            ".getPropertyValue('content')"
        )
        decoded = {}

        def reveal(match):
            css_class = match.group(1)
            if css_class not in decoded:
                # One browser round-trip per distinct class; the class name is
                # passed as a script argument rather than spliced into the JS.
                word = self.dr.execute_script(script, css_class)
                decoded[css_class] = (word or '').replace('"', '')
            return decoded[css_class]

        return self._OBFUSCATED_SPAN.sub(reveal, html)

    def _parse_price(self, text):
        """Return the first number in *text* as a float, or 0 if there is none."""
        found = self._PRICE.search(text)
        return float(found.group()) if found else 0

    def _extract_models(self, html):
        """Build the per-model list from a decoded configuration page.

        Returns a list of ``{'config': ..., 'price': ..., 'detail': [...]}``
        dicts, or ``None`` when the second ``.tbcs`` table or its first row
        is missing.
        """
        doc = p(html)
        names = [p(a).text() for a in doc.find('#config_nav td .carbox div a')]  # model names

        price_tables = doc.find('.tbcs')
        if len(price_tables) < 2:
            return None
        price_rows = p(price_tables[1]).find('tr')
        if not price_rows:
            return None
        prices = [self._parse_price(p(td).text()) for td in p(price_rows[0]).find('td')]

        models = [
            {'config': config, 'price': price, 'detail': []}
            for config, price in zip(names, prices)
        ]

        # Tables from the third one onwards are the configuration sub-categories.
        for table in doc.find('#config_data table')[2:]:
            table_pq = p(table)
            typename = table_pq.find('tr th h3 span').text()
            # One value list per model for this category, in model order.
            buckets = []
            for model in models:
                bucket = []
                model['detail'].append({typename: bucket})
                buckets.append(bucket)
            for tr in table_pq.find('tr')[1:]:
                row = p(tr)
                # MongoDB does not allow '.' in keys, so it is escaped here.
                title = row.find('th div').text().replace('.', '//dot//')
                # zip stops at the shorter side, so extra cells are ignored.
                for bucket, td in zip(buckets, row.find('td')):
                    bucket.append({title: p(td).text()})
        return models
# self.log(html)
# dr.quit()
# yield scrapy.Request(url % (id), callback=self.parse_item, meta={'manufacturer':manufacturer, 'brand':brand, 'icon':icon})
# def parse_item(self, response):
# doc = p(response.text)
# # title = doc.find('.subnav-title-name a').html()
# config = doc.html().split('var config =')
# if len(config) == 2:
# config = config[1].split('var option =')[0].strip()[:-1]
# self.log(config)
# config = json.loads(config)
# self.log(response.meta)
# for i in config:
# else:
# pass
'''
var keyLink = [{"id":1339,"link":"https://car.autohome.com.cn/baike/detail_8_26_1339.html","name":"<span class='hs_kw28_baikeph'></span><span class='hs_kw90_baikeph'></span>/<span class='hs_kw10_baikeph'></span>"},{"id":1340,"link":"https://car.autohome.com.cn/baike/detail_8_27_1340.html","name":"尾门玻璃<span class='hs_kw67_baikeph'></span>开启"},{"id":1341,"link":"https://car.autohome.com.cn/baike/detail_8_30_1341.html","name":"<span class='hs_kw80_baikeph'></span>数量"},{"id":1342,"link":"https://car.autohome.com.cn/baike/detail_8_31_1342.html","name":"<span class='hs_kw13_baikeph'></span>大灯雨雾模式"},{"id":1343,"link":"https://car.autohome.com.cn/baike/detail_8_30_1343.html","name":"车载CD/DVD"},{"id":1344,"link":"https://car.autohome.com.cn/baike/detail_8_30_1344.html","name":"USB/Type-C<span class='hs_kw41_baikeph'></span>数量"},{"id":2,"link":"https://car.autohome.com.cn/baike/detail_8_25_2.html","name":"ABS防抱死"},{"id":7,"link":"https://car.autohome.com.cn/baike/detail_8_25_7.html","name":"<span class='hs_kw7_baikeph'></span>(EBD/CBC等)"},{"id":9,"link":"https://car.autohome.com.cn/baike/detail_8_25_9.html","name":"刹车辅助(EBA/BAS/BA等)"},{"id":10,"link":"https://car.autohome.com.cn/baike/detail_8_25_10.html","name":"<span class='hs_kw73_baikeph'></span>(ASR/TCS/TRC等)"},{"id":11,"link":"https://car.autohome.com.cn/baike/detail_8_25_11.html","name":"车身<span class='hs_kw87_baikeph'></span>控制(ESC/ESP/DSC等)"},{"id":12,"link":"https://car.autohome.com.cn/baike/detail_8_25_12.html","name":"<span class='hs_kw51_baikeph'></span>辅助"},{"id":13,"link":"https://car.autohome.com.cn/baike/detail_8_25_13.html","name":"车道偏离<span class='hs_kw75_baikeph'></span>系统"},{"id":14,"link":"https://car.autohome.com.cn/baike/detail_8_25_14.html","name":"<span class='hs_kw85_baikeph'></span>刹车/<span 
class='hs_kw85_baikeph'></span>安全系统"},{"id":15,"link":"https://car.autohome.com.cn/baike/detail_8_25_15.html","name":"夜视系统"},{"id":16,"link":"https://car.autohome.com.cn/baike/detail_8_25_16.html","name":"道路交通标识识别"},{"id":17,"link":"https://car.autohome.com.cn/baike/detail_8_25_17.html","name":"疲劳<span class='hs_kw83_baikeph'></span>提示"},{"id":18,"link":"https://car.autohome.com.cn/baike/detail_8_25_18.html","name":"ISOFIX<span class='hs_kw77_baikeph'></span><span class='hs_kw41_baikeph'></span>"},{"id":19,"link":"https://car.autohome.c
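'''
# NOTE: the sample string above is truncated in the original paste. The lines
# below are listing metadata from the page this file was downloaded from; they
# are not part of the spider and are kept only as comments.
# 某车之家车型数据爬虫代码及插件.zip
# 需积分: 5 120 浏览量
# 2019-06-25
# 10:04:52
# 上传
# 评论 4
# 收藏 6.8MB ZIP 举报
# dsczijizuo
# - 粉丝: 13
# - 资源: 2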