# -*- coding: utf-8 -*-
import requests
import scrapy
import re
import json
from lxml import etree
from hdf.items import DoctorItem, DiseaseItem, ErrorItem


class HdfSpiderSpider(scrapy.Spider):
    # The spider's name doubles as the crawl target: the first line of
    # disease.txt holds the disease/department slug to scrape.
    with open('disease.txt', 'r', encoding='utf-8') as f:
        name = f.readline().strip()
allowed_domains = ['www.haodf.com', 'jiankanghao.haodf.com', 'm.haodf.com']
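
    # On request failure, flesh out any partially built item carried in
    # request.meta with empty fields so pipelines still record it, then
    # emit an ErrorItem for diagnostics.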
def errback_http(self, failure):
print("error", failure)
all_item = failure.request.meta.get('all_item')
if all_item:
all_item['sale'] = ''
all_item['like_num'] = ''
all_item['buy_price'] = ''
all_item['online_num'] = ''
all_item['hot_num'] = ''
yield all_item
doctor_item = failure.request.meta.get('doctor_item')
if doctor_item:
doctor_item['hospital_lever'] = ''
yield doctor_item
request = failure.request
error_item = ErrorItem()
error_item['error_method'] = 'errback_http'
error_item['error_content'] = '{}'.format(failure)
error_item['error_url'] = '{}'.format(request.url)
error_item['error_response'] = ''
yield error_item
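
    # Slugs containing 'ke' (科, department) get the paginated doctor-list
    # pages; plain disease slugs get the recommended-doctor page.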
def start_requests(self):
disease_ls = [self.name]
if 'ke' in self.name:
for disease in disease_ls:
headers = {
'Host': 'www.haodf.com',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
# 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
'sec-ch-ua-mobile': '?0',
# 'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
# 'Referer': 'https://www.haodf.com/',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
}
url = 'https://www.haodf.com/doctor/list-all-{}.html?p=1'.format(disease)
yield scrapy.Request(
url=url,
callback=self.parse,
headers=headers,
errback=self.errback_http,
meta={'disease': disease}
)
else:
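            # Disease slugs: request the condition's recommended-doctor page.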
for disease in disease_ls:
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.haodf.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
}
url = 'https://www.haodf.com/citiao/jibing-{}/tuijian-doctor.html'.format(disease)
yield scrapy.Request(
url=url,
callback=self.parse,
headers=headers,
errback=self.errback_http,
meta={'disease': disease}
)
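
    # Department list pages are GBK-encoded and parsed with lxml; each doctor
    # found is followed into an ajax endpoint for service details.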
def parse(self, response):
disease = response.meta.get('disease')
try:
if 'ke' in disease:
                # gbk is a superset of gb2312; fall back to it when a page
                # contains characters outside the gb2312 range.
                try:
                    html = response.body.decode('gb2312')
                except UnicodeDecodeError:
                    self.logger.debug('gb2312 decode failed for %s', response.request.url)
                    html = response.body.decode('gbk')
                html = etree.HTML(html)
doc_list = html.xpath('//div[@class="d-doc-list"]/ul/li')
for li in doc_list:
                    doctor_name = li.xpath('.//span[@class="doc-name"]/a/text()')[0].strip()
                    doctor_id = re.search(r'/(\d+)\.html$', li.xpath('.//span[@class="doc-name"]/a/@href')[0].strip()).group(1)
                    self.logger.info('found doctor for %s: %s %s', self.name, doctor_id, doctor_name)
                    # Ajax endpoint returning the doctor's online-service info as JSON.
                    url2 = 'https://www.haodf.com/ndoctor/ajaxGetSrvDocInfo?doctorId={}'.format(doctor_id)
headers2 = {
'accept': 'application/json, text/plain, */*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
                        # ('if-none-match' dropped: a stale ETag could return an empty 304.)
'referer': 'https://www.haodf.com/doctor/{}/fuwu-wenzhen.html'.format(doctor_id),
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
'sec-ch-ua-mobile': '?0',
                        'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
}
yield scrapy.Request(
url=url2,
callback=self.parse_item,
errback=self.errback_http,
headers=headers2,
meta={'disease': disease, 'doctor_id': doctor_id, 'doctor_name': doctor_name}
)
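                # Follow pagination; the crawl is capped at 100 list pages.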
                next_page = response.xpath('//div[@class="page_turn"]/a[contains(string(), "下一页")]/@href').extract_first()
                if next_page:
                    # Take the page number from the p= query parameter; a bare
                    # \d+ search could match digits inside the disease slug.
                    page_num = int(re.search(r'p=(\d+)', next_page).group(1))
                    if page_num > 100:
                        return
                    url2 = 'https://www.haodf.com' + next_page
headers2 = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Host': 'www.haodf.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
}
yield scrapy.Request(
url2,
callback=self.parse,
errback=self.errback_http,
                        # Assumed completion: reuse headers2 and carry the disease
                        # slug in meta, mirroring the first-page request.
                        headers=headers2,
                        meta={'disease': disease}
                    )
        except Exception as e:
            # Record parse failures as ErrorItems instead of aborting the crawl.
            error_item = ErrorItem()
            error_item['error_method'] = 'parse'
            error_item['error_content'] = '{}'.format(e)
            error_item['error_url'] = '{}'.format(response.request.url)
            error_item['error_response'] = ''
            yield error_item