# encoding: utf-8
# Dependencies:
#   pip install bs4
#   pip install requests
#   (use "sudo pip install <package>" for a system-wide install)
# To list all currently installed packages:  pip list
# To install pip itself if missing:          sudo easy_install pip
import requests
from bs4 import BeautifulSoup
import json
import time
def crawl_detail(id):
    """Fetch a Lagou job-detail page and return the job-description text.

    :param id: position id (int or str) interpolated into the detail URL.
    :return: text of the ``dd.job_bt`` description block, or ``''`` when
             the block is missing (e.g. the site served its anti-crawler
             "operation too frequent" page instead of a real detail page).
    :raises requests.RequestException: on network failure / HTTP error.
    """
    url = 'https://www.lagou.com/jobs/%s.html' % id
    # Host/Referer/User-Agent mimic a normal browser visit so the site is
    # less likely to reject the request as a bot.
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    # timeout keeps a stalled connection from hanging the whole crawl;
    # raise_for_status fails loudly instead of parsing an HTTP error page.
    req = requests.get(url, headers=headers, timeout=10)
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'lxml')
    job_bt = soup.find('dd', attrs={'class': 'job_bt'})
    # Guard: the original code crashed with AttributeError here when the
    # description block was absent; return '' so the crawl can continue.
    if job_bt is None:
        return ''
    return job_bt.text
def main(keyword='python', pages=4):
    """Crawl Lagou's position-search JSON API for jobs in Beijing, enrich
    each hit with its detail-page description, and dump everything to
    ``lagou.json`` (UTF-8 encoded JSON array).

    :param keyword: search keyword sent in the ``kd`` form field
                    (default ``'python'``, matching the original script).
    :param pages: number of result pages to fetch, starting at page 1
                  (default 4, matching the original ``range(1, 5)``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest'
    }
    # city=%E5%8C%97%E4%BA%AC is URL-encoded "北京" (Beijing).
    api_url = ('https://www.lagou.com/jobs/positionAjax.json'
               '?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false')
    positions = []
    for page in range(1, pages + 1):
        form_data = {
            'first': 'true',
            'pn': page,       # page number
            'kd': keyword,    # search keyword
        }
        result = requests.post(api_url, headers=headers, data=form_data,
                               timeout=10)
        result.raise_for_status()
        json_result = result.json()
        print(json_result)
        print('=' * 50)
        page_positions = json_result['content']['positionResult']['result']
        for position in page_positions:
            # Keep only the fields we care about.
            position_dict = {
                'position_name': position['positionName'],
                'work_year': position['workYear'],
                'salary': position['salary'],
                'district': position['district'],
                'company_name': position['companyFullName'],
            }
            # Follow up with a second request for the full job description.
            position_id = position['positionId']
            position_dict['position_detail'] = crawl_detail(position_id)
            positions.append(position_dict)
            # Throttle: the site answers "operation too frequent, try again
            # later" when hit too fast. Mitigations: raise this sleep, or
            # fetch fewer positions per run.
            time.sleep(5)
    line = json.dumps(positions, ensure_ascii=False)
    # Binary mode so the explicit UTF-8 encode below works on both
    # Python 2 and Python 3 (text mode rejects bytes on py3).
    with open('lagou.json', 'wb') as fp:
        fp.write(line.encode('utf-8'))
# Script entry point: run the full crawl when executed directly.
if __name__ == '__main__':
    main()
    # crawl_detail('3265286')  # one-off manual test of a single detail page
# Fallback idea if the JSON API keeps blocking us: render the pages with
# a real browser via selenium + PhantomJS/ChromeDriver.