import scrapy
from lxml import etree
class Tc3Spider(scrapy.Spider):
    """Spider that scrapes full-time backend-developer job listings from zz.58.com
    and prints each posting's title, salary, tags, company and requirements.

    NOTE(review): parse() reads a locally saved page ('s58test.html') rather than
    the live ``response`` — presumably for offline debugging of the anti-crawler
    block page; confirm before deploying against the live site.
    """

    name = "tc3"
    allowed_domains = ["zz.58.com"]
    start_urls = ["https://zz.58.com/quanzhizhaopin/?key=%E5%90%8E%E7%AB%AF%E5%BC%80%E5%8F%91&classpolicy=strategy%2Cuuid_939eb358a0a14d338d323bb1480419b1%2Cdisplocalid_342%2Cfrom_674%2Cto_jump%2Ctradeline_job%2Cclassify_E&search_uuid=939eb358a0a14d338d323bb1480419b1&final=1"]

    @staticmethod
    def _clean(value):
        """Collapse every whitespace character (\\r, \\n, tabs, spaces) out of *value*."""
        return "".join(str(value).split())

    @staticmethod
    def _first(nodes, default=""):
        """Return the first item of an xpath result list, or *default* when empty.

        Guards against IndexError on listings that lack an expected element.
        """
        return nodes[0] if nodes else default

    def parse(self, response):
        """Parse a saved 58.com listing page and print every job posting found.

        If the page is the anti-crawler block page (contains "访问过于频繁"),
        print a warning instead of parsing listings.
        """
        print("=======爬虫结果开始=======")
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.parse('s58test.html', parser=parser)
        # BUG FIX: the original called ElementTree.find() — an XPath element
        # lookup that returns None — as if it were str.find(), so `== -1` was
        # always False; and the two branches were inverted, printing the
        # "too frequent" warning when the block marker was ABSENT. Extract the
        # page's text content and test for the marker directly.
        page_text = etree.tostring(tree, encoding="unicode", method="text")
        if "访问过于频繁" in page_text:
            print("访问过于频繁,等5-10分钟后再试")
        else:
            list_con = tree.xpath('//*[@id="list_con"]/li')
            for idx, con in enumerate(list_con, start=1):
                print(f"【第{idx}份工作信息】:")
                # Job title — strip all embedded newlines/spaces.
                job_name = con.xpath('.//div[@class="job_name clearfix"]/a/text()')
                print("【工作名称】:", self._clean(self._first(job_name)))
                # Salary string, printed as-is.
                job_salary = con.xpath('.//p[@class="job_salary"]/text()')
                print("【工作薪水】:", self._first(job_salary))
                # Benefit tags, joined with a trailing space per tag (matches
                # the original output format).
                wel_spans = con.xpath('.//div[@class="job_wel clearfix"]/span')
                job_wel = "".join(str(span.text).strip() + " " for span in wel_spans)
                print("【工作标签】:", job_wel)
                # Hiring company name.
                comp_name = con.xpath('.//div[@class="comp_name"]/a/text()')
                print("【招聘公司名称】:", self._clean(self._first(comp_name)))
                # Job requirements (experience / education spans).
                req_spans = con.xpath('.//p[@class="job_require"]/span')
                job_require = "".join(str(span.text).strip() + " " for span in req_spans)
                print("【招聘要求】:", job_require)
        print("=======爬虫结果结束=======")