import os
import time
import random
import pandas as pd
import numpy as np
from playwright.sync_api import Playwright, sync_playwright
def name_file(name: str) -> str:
    """Return the first ``{name}_{ix}.xlsx`` path that does not exist yet.

    ``ix`` starts at 0 and is incremented until ``os.path.exists`` reports
    that the candidate is free, so earlier exports are never overwritten.

    Args:
        name: Base name (may include a directory prefix) of the output file.

    Returns:
        The unused file name, e.g. ``'report_0.xlsx'``.
    """
    ix = 0
    # Probe candidates in order; the first gap in the numbering wins.
    while os.path.exists(f'{name}_{ix}.xlsx'):
        ix += 1
    return f'{name}_{ix}.xlsx'
def get_new_page_info(context, Locator):
    """Open a job posting in a new tab, scrape its fields, and close the tab.

    Clicking ``Locator`` is expected to open a new page in ``context``. When
    the new page's title contains '滑动', ``sliding_path`` is called on it
    first to drag the slider before the fields are read.

    Args:
        context: Playwright browser context in which the new tab opens.
        Locator: Playwright locator of the search-result element to click.

    Returns:
        ``[position_name, job_company, job_request, salary, position_label,
        content]`` — the text content of each field. ``job_company`` is
        ``''`` when its element cannot be read.

    Raises:
        Propagates whatever Playwright raises if the new page never opens or
        a field other than ``job_company`` cannot be read. The tab is closed
        in either case.
    """
    with context.expect_page() as new_page_info:
        Locator.click()
    new_page = new_page_info.value
    try:
        new_page.wait_for_load_state()
        # Short default timeout so a missing element fails fast instead of
        # stalling the whole crawl.
        new_page.set_default_timeout(1000)
        if '滑动' in new_page.title():
            sliding_path(new_page)
        # Returns immediately when the page is already loaded.
        new_page.wait_for_load_state()
        position_name = new_page.locator('xpath=/html/body/section[3]/div[1]/div[1]/span[1]').text_content()
        try:
            job_company = new_page.locator('xpath=/html/body/main/aside/div[3]').text_content()
        except Exception:
            # The company panel is optional; fall back to an empty string.
            # (Not a bare ``except`` so Ctrl-C is not swallowed here.)
            job_company = ''
        job_request = new_page.locator('xpath=/html/body/section[3]/div[1]/div[2]').text_content()
        salary = new_page.locator('xpath=/html/body/section[3]/div[1]/div[1]/span[2]').text_content()
        position_label = new_page.locator('xpath=/html/body/section[4]/div/div[1]').text_content()
        content = new_page.locator('xpath=/html/body/main/content/section[2]').text_content()
    finally:
        # Always close the tab, even when scraping fails, so tabs do not
        # pile up in the attached browser.
        new_page.close()
    return [position_name, job_company, job_request, salary, position_label, content]
def _sample_offsets(start, stop, size, scale):
    """Return jittered offsets from ``start`` to ``stop`` sampled at indices i**2 - 1.

    ``size`` evenly spaced points get Gaussian noise of standard deviation
    ``scale``; only the points at indices 0, 3, 8, 15, ... are kept, so the
    gap between consecutive kept points grows along the path.
    """
    lst = np.linspace(start, stop, size) + np.random.normal(size=size, scale=scale)
    ix = np.arange(1, int(size ** 0.5 + 1)) ** 2 - 1
    return lst[ix]


def sliding_path(page):
    """Drag the slider on ``page`` along a noisy path that overshoots and comes back.

    The mouse is pressed on the slider handle, moved past the measured
    distance by 20 %, moved back toward a 300 px offset, released, and then
    the page's load state is awaited.

    Args:
        page: Playwright page showing the slider (elements ``#nc_1_n1z`` and
            ``#nc_1__scale_text``).

    Raises:
        RuntimeError: if either slider element has no bounding box.
    """
    # Locate the slider handle and its containing track text.
    slider_box = page.locator('xpath=//*[@id="nc_1_n1z"]').bounding_box()
    contain_box = page.locator('xpath=//*[@id="nc_1__scale_text"]/span').bounding_box()
    if slider_box is None or contain_box is None:
        # bounding_box() yields None for elements that are not visible;
        # fail with a clear message instead of a TypeError on subscripting.
        raise RuntimeError('slider captcha elements are not visible on the page')

    distance = contain_box['width']
    start_x = int(slider_box['x'])
    y = slider_box['y'] + slider_box['height'] / 2
    # NOTE(review): hard-coded final offset in pixels, independent of the
    # measured ``distance`` — presumably the track width; confirm.
    final_offset = 300

    page.mouse.move(x=start_x, y=y)
    page.mouse.down()

    # Overshoot: go 20 % past the measured distance.
    tolerance = distance * 0.2
    move_list = _sample_offsets(0, distance + tolerance, size=1000, scale=3)
    for move in move_list:
        page.mouse.move(x=start_x + move, y=y, steps=3)

    # Return: come back from the overshoot toward the final offset.
    move_list = _sample_offsets(move_list[-1], final_offset, size=100, scale=10)
    for move in move_list:
        page.mouse.move(x=start_x + move, y=y, steps=10)

    # Land exactly on the final offset before releasing.
    page.mouse.move(x=start_x + final_offset, y=y, steps=3)
    page.mouse.up()
    page.wait_for_load_state()
def run(playwright: Playwright) -> None:
    """Scrape up to 30 result pages of job postings and export them to Excel.

    Attaches to an already running Chromium over CDP at
    ``http://localhost:6568`` and uses its first context and first page. For
    every result page, each matching search-result element is opened via
    ``get_new_page_info``; then the "right" pagination arrow is clicked.

    Any exception stops the crawl and is printed. Whatever was collected so
    far is always written to a fresh ``猎聘_{n}.xlsx`` file — including when
    the crawl is interrupted (e.g. Ctrl-C), in which case the interrupt
    propagates after the file is saved.

    Args:
        playwright: The Playwright instance yielded by ``sync_playwright()``.
    """
    browser = playwright.chromium.connect_over_cdp('http://localhost:6568')
    context = browser.contexts[0]
    page = context.pages[0]
    info_list = []
    try:
        for _ in range(30):
            Locators = page.locator('xpath=//*[@id="lp-search-job-box"]/div[3]/section[1]/div[1]/div/div/div[1]/div/a/div[1]/div')
            for Locator in Locators.all():
                info = get_new_page_info(context, Locator)
                time.sleep(0.2)
                print(info)
                info_list.append(info)
            # Advance to the next result page.
            page.locator('xpath=//span[@aria-label="right"]').click()
            time.sleep(1)
            page.wait_for_load_state()
    except Exception as e:
        print(e)
    finally:
        # Save in ``finally`` so partial results survive interrupts too.
        df = pd.DataFrame(info_list, columns=['position_name', 'job_company', 'job_request', 'salary', 'position_label', 'content'])
        df.to_excel(name_file('猎聘'), index=False)
    # Manual test (kept for debugging): scrape only the second search result.
    # locator = page.locator('xpath=//*[@id="lp-search-job-box"]/div[3]/section[1]/div[1]/div/div/div[1]/div/a/div[1]/div').nth(1)
    # info = get_new_page_info(context, locator)
    # print(info)
# Script entry: start Playwright, run the crawl, and shut Playwright down
# when the crawl returns.
with sync_playwright() as pw:
    run(pw)
没有合适的资源?快使用搜索试试~ 我知道了~
playwright自动化 + 招聘数据
共35个文件
xlsx:28个
py:6个
ipynb:1个
需积分: 0 1 下载量 116 浏览量
2024-03-12
12:53:16
上传
评论
收藏 9.15MB ZIP 举报
温馨提示
playwright自动化获取招聘数据
资源推荐
资源详情
资源评论
收起资源包目录
playwright自动化.zip (35个子文件)
playwright自动化
智联.py 2KB
前程.py 4KB
国家大学生就业服务平台_10.xlsx 234KB
猎聘.py 4KB
国家大学生就业服务平台_12.xlsx 27KB
招聘信息汇总.xlsx 2.3MB
智联_0.xlsx 167KB
main.ipynb 0B
国家大学生就业服务平台_4.xlsx 358KB
智联_3.xlsx 248KB
猎聘_1.xlsx 693KB
国家大学生就业服务平台_3.xlsx 65KB
智联_1.xlsx 103KB
国家大学生就业服务平台_8.xlsx 215KB
国家大学生就业服务平台_6.xlsx 552KB
国家大学生就业服务平台.py 4KB
国家大学生就业服务平台_2.xlsx 24KB
国家大学生就业服务平台_1.xlsx 311KB
国家大学生就业服务平台_9.xlsx 541KB
拉钩.py 2KB
拉钩_1.xlsx 163KB
数据
拉钩.xlsx 313KB
猎聘.xlsx 807KB
智联.xlsx 429KB
前程.xlsx 180KB
智联_2.xlsx 36KB
前程_0.xlsx 188KB
国家大学生就业服务平台_7.xlsx 123KB
国家大学生就业服务平台_11.xlsx 5KB
test.py 426B
猎聘_0.xlsx 158KB
拉钩_2.xlsx 332KB
拉钩_0.xlsx 160KB
国家大学生就业服务平台_5.xlsx 521KB
国家大学生就业服务平台_0.xlsx 161KB
共 35 条
- 1
资源评论
Bigcrab__
- 粉丝: 3003
- 资源: 8
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功