# -*- coding: utf-8 -*-
import json
import re

import requests
import scrapy
from lxml import etree

from hdf.items import DoctorItem, DiseaseItem, ErrorItem


class HdfSpiderSpider(scrapy.Spider):
    # The spider name doubles as the crawl target: a disease (or department)
    # slug read from the first line of disease.txt at class-definition time.
    with open('disease.txt', 'r') as f:
        name = [i.strip() for i in f.readlines()][0]
    allowed_domains = ['www.haodf.com', 'jiankanghao.haodf.com', 'm.haodf.com']
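
    # Usage sketch (an assumption, not from the original source): disease.txt
    # holds one slug per line and only the first is used. Because that slug
    # becomes the spider name, a crawl for e.g. hypertension ("gaoxueya") is
    # started with:
    #   echo gaoxueya > disease.txt
    #   scrapy crawl gaoxueya
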
    def errback_http(self, failure):
        # Request-level error handler. Scrapy processes errback output like
        # callback output, so items can be yielded here: any item that was in
        # flight is emitted with its pending fields blanked, followed by an
        # ErrorItem recording the failure.
        print("error", failure)
        all_item = failure.request.meta.get('all_item')
        if all_item:
            # Blank the consultation-service fields the failed request was
            # meant to fill ('all_item' is attached to request.meta by
            # callbacks later in the spider).
            all_item['sale'] = ''
            all_item['like_num'] = ''
            all_item['buy_price'] = ''
            all_item['online_num'] = ''
            all_item['hot_num'] = ''
            yield all_item
        doctor_item = failure.request.meta.get('doctor_item')
        if doctor_item:
            doctor_item['hospital_lever'] = ''  # field name as defined in hdf.items
            yield doctor_item
        request = failure.request
        error_item = ErrorItem()
        error_item['error_method'] = 'errback_http'
        error_item['error_content'] = '{}'.format(failure)
        error_item['error_url'] = '{}'.format(request.url)
        error_item['error_response'] = ''
        yield error_item

    def start_requests(self):
        disease_ls = [self.name]
        # Slugs containing 'ke' (科, department) are crawled through the
        # paginated doctor list; plain disease slugs go through the
        # recommended-doctor page for that disease.
        if 'ke' in self.name:
            for disease in disease_ls:
                headers = {
                    'Host': 'www.haodf.com',
                    'Connection': 'keep-alive',
                    'Pragma': 'no-cache',
                    'Cache-Control': 'no-cache',
                    'sec-ch-ua-mobile': '?0',
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Sec-Fetch-Site': 'same-origin',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-User': '?1',
                    'Sec-Fetch-Dest': 'document',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                }
                url = 'https://www.haodf.com/doctor/list-all-{}.html?p=1'.format(disease)
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    headers=headers,
                    errback=self.errback_http,
                    meta={'disease': disease},
                )
        else:
            for disease in disease_ls:
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                    'Cache-Control': 'no-cache',
                    'Connection': 'keep-alive',
                    'Host': 'www.haodf.com',
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
                }
                url = 'https://www.haodf.com/citiao/jibing-{}/tuijian-doctor.html'.format(disease)
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    headers=headers,
                    errback=self.errback_http,
                    meta={'disease': disease},
                )

    def parse(self, response):
        disease = response.meta.get('disease')
        try:
            if 'ke' in disease:
                # Department list pages are served in a GB encoding; try
                # gb2312 first and fall back to the gbk superset.
                try:
                    html = response.body.decode('gb2312')
                except UnicodeDecodeError:
                    print(response.request.url)
                    html = response.body.decode('gbk')
                html = etree.HTML(html)
                doc_list = html.xpath('//div[@class="d-doc-list"]/ul/li')
                for li in doc_list:
                    doctor_name = li.xpath('.//span[@class="doc-name"]/a/text()')[0].strip()
                    # The doctor id is the numeric part of the profile URL,
                    # e.g. /doctor/123456.html -> 123456.
                    doctor_id = re.search(r'/(\d+)\.html$', li.xpath('.//span[@class="doc-name"]/a/@href')[0].strip()).group(1)
                    print('Fetched doctor for ->{}<-:'.format(self.name), doctor_id, doctor_name)
                    # Per-doctor ajax endpoint carrying the consultation-service info.
                    url2 = 'https://www.haodf.com/ndoctor/ajaxGetSrvDocInfo?doctorId={}'.format(doctor_id)
                    headers2 = {
                        'accept': 'application/json, text/plain, */*',
                        'accept-encoding': 'gzip, deflate, br',
                        'accept-language': 'zh-CN,zh;q=0.9',
                        'if-none-match': 'W/"1ce-lpqDXO1HihbefSgpyc5PYA"',  # stale ETag from the recorded session; may trigger 304s
                        'referer': 'https://www.haodf.com/doctor/{}/fuwu-wenzhen.html'.format(doctor_id),
                        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
                        'sec-ch-ua-mobile': '?0',
                        'sec-ch-ua-platform': '"Windows"',
                        'sec-fetch-dest': 'empty',
                        'sec-fetch-mode': 'cors',
                        'sec-fetch-site': 'same-origin',
                        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
                    }
                    yield scrapy.Request(
                        url=url2,
                        callback=self.parse_item,
                        errback=self.errback_http,
                        headers=headers2,
                        meta={'disease': disease, 'doctor_id': doctor_id, 'doctor_name': doctor_name},
                    )

                # Follow pagination via the "下一页" ("next page") link.
                next_page = response.xpath('//div[@class="page_turn"]/a[contains(string(), "下一页")]/@href').extract_first()
                if next_page:
                    # Page index from the ?p= query parameter (anchored so
                    # digits in the slug are not matched); cap at 100 pages.
                    page_num = int(re.search(r'p=(\d+)', next_page).group(1))
                    if page_num > 100:
                        return
                    url2 = 'https://www.haodf.com' + next_page
                    headers2 = {
                        'Accept': '*/*',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Cache-Control': 'no-cache',
                        'Connection': 'keep-alive',
                        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                        'Host': 'www.haodf.com',
                        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
                        'X-Requested-With': 'XMLHttpRequest',
                        'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
                        'sec-ch-ua-mobile': '?0',
                        'Sec-Fetch-Dest': 'empty',
                        'Sec-Fetch-Mode': 'cors',
                        'Sec-Fetch-Site': 'same-origin',
                    }
                    yield scrapy.Request(
                        url2,
                        callback=self.parse,
                        errback=self.errback_http,
                        # The source listing is cut off after the errback
                        # argument; headers/meta restored to match the pattern
                        # of the earlier requests (assumption).
                        headers=headers2,
                        meta={'disease': disease},
                    )
        except Exception as e:
            # The source listing breaks off mid-method; this handler is a
            # minimal reconstruction (assumption) closing the try block above,
            # mirroring the ErrorItem pattern used in errback_http.
            error_item = ErrorItem()
            error_item['error_method'] = 'parse'
            error_item['error_content'] = '{}'.format(e)
            error_item['error_url'] = '{}'.format(response.request.url)
            error_item['error_response'] = ''
            yield error_item
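
    # --- The source listing is truncated from this point on. ---
    # parse_item (referenced above) is not part of the excerpt; the stub below
    # is a hedged sketch, not the author's code, added so the module stays
    # importable. Judging by the fields errback_http blanks out, the real
    # method presumably maps the ajaxGetSrvDocInfo JSON (sale, like_num,
    # buy_price, online_num, hot_num, ...) onto the doctor items.
    def parse_item(self, response):
        data = json.loads(response.text)  # service info for one doctor
        print(response.meta.get('doctor_id'), response.meta.get('doctor_name'), data)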