# -*- coding: utf-8 -*-
# @author 张函仑
# version 1.0
# Crawls 51job for software-development job postings
import scrapy
from HRInformation.items import HrinformationItem
class A51jobhrSpider(scrapy.Spider):
    """Spider that crawls 51job search results for software-development jobs.

    ``parse`` walks the rows of each search-result page, starts one
    ``HrinformationItem`` per posting, and follows the posting's detail link
    so ``parse_second`` can fill in the headcount, company type, and job
    description before the item is yielded.
    """

    name = '51jobHR'
    allowed_domains = ['51job.com']
    start_urls = ['http://search.51job.com/jobsearch/search_result.php?fromJs=1&industrytype=32%2C01%2C40&keyword=%E5%BC%80%E5%8F%91&keywordtype=2&lang=c&stype=2&postchannel=0000&fromType=1&confirmdate=9']

    def parse(self, response):
        """Parse one search-result page.

        Yields a ``scrapy.Request`` (carrying the partially filled item in
        ``meta``) for every posting row, then follows the "next page" link
        when one exists.
        """
        # Result rows occupy div[4]..div[53] of the #resultList container.
        for row in range(4, 54):
            node_xpath = '//*[@id="resultList"]/div[%d]' % row
            item = HrinformationItem()
            # Job title
            item['position_name'] = response.xpath(node_xpath + '/p/span/a/@title').extract()[0]
            # Job category — every hit of this search is treated as technical
            item['position_type'] = '技术类'
            # Link to the posting's detail page
            item['position_link'] = response.xpath(node_xpath + '/p/span/a/@href').extract()[0]
            # Company name
            item['company_name'] = response.xpath(node_xpath + '/span[1]/a/text()').extract()[0]
            # Work location
            item['work_location'] = response.xpath(node_xpath + '/span[2]/text()').extract()[0]
            # Salary is missing on some postings, so fall back to "".
            # (Hoisted: the original evaluated this xpath twice.)
            salary_nodes = response.xpath(node_xpath + '/span[3]/text()').extract()
            item['salary'] = salary_nodes[0] if salary_nodes else ""
            # Publish date: the site shows only month-day, so prefix a year.
            # NOTE(review): the hard-coded '2018-' will be wrong in other years.
            item['publish_time'] = '2018-' + response.xpath(node_xpath + '/span[4]/text()').extract()[0]
            item['position_attribute'] = '社会招聘'
            # Fetch the detail page to finish filling the item.
            yield scrapy.Request(item['position_link'], meta={'item': item}, callback=self.parse_second)
        # Pagination: div[55] holds the pager; li[8]/a is the "next" link.
        next_pages = response.xpath('//*[@id="resultList"]/div[55]/div/div/div/ul/li[8]/a/@href').extract()
        if next_pages:
            next_url = next_pages[0]
            print(next_url)
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_second(self, response):
        """Parse a posting's detail page and yield the completed item."""
        item = response.meta['item']
        second_node_pnumber_xpath = "/html/body/div[3]/div[2]/div[3]/div[1]/div/div/span[3]/text()"
        second_node_pinformation_xpath = "/html/body/div[3]/div[2]/div[3]/div[2]/div/p/text()"
        second_node_pinformation_xpath1 = '/html/body/div[3]/div[2]/div[3]/div[2]/div/text()'
        # Headcount text looks like "招5人", "招15人", "招5-10人" or "招若干人".
        item['people_number'] = response.xpath(second_node_pnumber_xpath).extract()[0]
        # Skip the leading "招" and keep the next two characters
        # (equivalent to the original chars[1] + chars[2]).
        people_number = item['people_number'][1:3]
        if people_number == '若干':
            # "several" — use 10 as a stand-in count
            item['people_number'] = '10'
        elif people_number[1] == '人':
            # single digit, e.g. "5人" -> "5"
            item['people_number'] = people_number[0]
        elif people_number[1] == '-':
            # range, e.g. "5-10人" -> keep the lower bound
            item['people_number'] = people_number[0]
        else:
            # two digits, e.g. "15人" -> keep both
            item['people_number'] = people_number
        # Company type is read positionally from the info line.
        # NOTE(review): chars [6:8] assume a fixed line layout — fragile.
        company_line = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()').extract()[0]
        item['company_type'] = company_line[6:8]
        # Job description: usually inside <p> tags; some pages put the text
        # directly under the div, so fall back to the second xpath.
        item['position_information'] = response.xpath(second_node_pinformation_xpath).extract()
        if not item['position_information']:
            item['position_information'] = response.xpath(second_node_pinformation_xpath1).extract()
        yield item
# NOTE(review): removed CSDN download-page boilerplate (resource listing,
# upload/points text) that had been pasted after the spider code — it was
# not Python and made the file unparseable.