import os
import re
import codecs
import json
import sys
from scrapy import Spider
from scrapy.selector import Selector
from lagou.items import LagouItem
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
class TencentSpider(Spider):
    """Scrape Java job postings in Guangzhou from Lagou's position-search
    AJAX endpoint (https://www.lagou.com/jobs/positionAjax.json).

    The endpoint is paginated via the ``pn`` form field; each page returns a
    JSON document whose ``content.positionResult.result`` array holds the
    postings. Results are emitted as ``LagouItem`` objects and handled by
    ``lagou.pipelines.LagouPipeline``.
    """

    name = "lagou"
    # allowed_domains = ["lagou.com"]

    # Lagou rejects requests that do not look like the site's own XHR calls,
    # so replicate the headers a real browser session sends.
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_java?',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest'
        },
        "ITEM_PIPELINES": {
            'lagou.pipelines.LagouPipeline': 300
        }
    }

    def start_requests(self):
        """Yield one POST request per result page (pages 1-59).

        Change the ``city`` query parameter to scrape a different city, and
        the ``kd`` form field to search a different keyword.
        """
        url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0&city=广州"
        for page in range(1, 60):
            formdata = {'first': 'false', 'pn': str(page), 'kd': 'java'}
            yield FormRequest(url, callback=self.parse_model, formdata=formdata)

    def parse_model(self, response):
        """Parse one JSON page of search results into LagouItem objects.

        :param response: scrapy Response whose body is the endpoint's JSON.
        :returns: list of populated ``LagouItem`` instances.
        """
        payload = json.loads(response.body.decode())
        results = payload['content']['positionResult']['result']
        items = []
        for result in results:
            item = LagouItem()
            item['name'] = result['positionName']
            # Location = city, plus district and business zones when present.
            location = result['city']
            if result['district']:
                location += result['district']
            if result['businessZones']:
                location += "".join(result['businessZones'])
            item['workLocation'] = location
            item['money'] = result['salary']
            item['demand'] = result['workYear'] + "/" + result['education']
            # 'positionLables' is the (misspelled) key the Lagou API actually
            # uses; it may be null, so guard before joining.
            item['skillLabel'] = ",".join(result['positionLables'] or [])
            item['positionAdvantage'] = result['positionAdvantage']
            item['publishTime'] = result['formatCreateTime']
            item['company'] = result['companyFullName']
            item['companyField'] = result['industryField']
            # companyLabelList may also be null in the API response.
            item['companyLabelList'] = ",".join(result['companyLabelList'] or [])
            item['detailLink'] = "https://www.lagou.com/jobs/" + str(result['positionId']) + ".html"
            # Fixed: original wrote str(+result['companyId']) — the stray
            # unary '+' would raise TypeError for a non-numeric companyId.
            item['detailCompany'] = "https://www.lagou.com/gongsi/" + str(result['companyId']) + ".html"
            items.append(item)
        return items
使用scrapy框架爬取拉勾网数据
需积分: 50 40 浏览量
2017-09-02
20:03:56
上传
评论 2
收藏 10KB RAR 举报
Mankind_萌凯
- 粉丝: 134
- 资源: 2
最新资源
- 656
- 人工智能课程实验matlab实现遗传算法源代码,其中main为主程序,其它是实现的相关函数
- mysql-connector-c-6.1.11 - x86&x64
- 海尔智能电视刷机数据 LD32U3200 机编DH1RV000201 务必确认机编一致 强制刷机 整机USB升级主程序.zip
- sdflkhskfld;j'
- 物流管理系统项目(源码+演示录像+说明文档+数据库)-springboot+vue毕业设计.zip
- 福禄克5720多功能源表手册,内容包括软件编程手册
- CNNxxxxxxxxx
- 2024北森题库(含答案)
- vrrp的实验,如vrrp的负载分担,vrrp的主备链路,vrrp联动(上行链路波动)
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈