【免费】playwright自动化+招聘数据_playwright._impl.

共35个文件

xlsx：28个

py：6个

ipynb：1个

需积分: 0 116 浏览量 2024-03-12 12:53:16 上传评论收藏 9.15MB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

playwright自动化.zip （35个子文件）

playwright自动化

智联.py 2KB

前程.py 4KB

国家大学生就业服务平台_10.xlsx 234KB

猎聘.py 4KB

国家大学生就业服务平台_12.xlsx 27KB

招聘信息汇总.xlsx 2.3MB

智联_0.xlsx 167KB

main.ipynb 0B

国家大学生就业服务平台_4.xlsx 358KB

智联_3.xlsx 248KB

猎聘_1.xlsx 693KB

国家大学生就业服务平台_3.xlsx 65KB

智联_1.xlsx 103KB

国家大学生就业服务平台_8.xlsx 215KB

国家大学生就业服务平台_6.xlsx 552KB

国家大学生就业服务平台.py 4KB

国家大学生就业服务平台_2.xlsx 24KB

国家大学生就业服务平台_1.xlsx 311KB

国家大学生就业服务平台_9.xlsx 541KB

拉钩.py 2KB

拉钩_1.xlsx 163KB

数据

拉钩.xlsx 313KB

猎聘.xlsx 807KB

智联.xlsx 429KB

前程.xlsx 180KB

智联_2.xlsx 36KB

前程_0.xlsx 188KB

国家大学生就业服务平台_7.xlsx 123KB

国家大学生就业服务平台_11.xlsx 5KB

test.py 426B

猎聘_0.xlsx 158KB

拉钩_2.xlsx 332KB

拉钩_0.xlsx 160KB

国家大学生就业服务平台_5.xlsx 521KB

国家大学生就业服务平台_0.xlsx 161KB

import os
import time
import random
import pandas as pd
import numpy as np
from playwright.sync_api import Playwright, sync_playwright

# Horizontal offset (px, relative to the slider's left edge) at which the
# slider is finally released after the overshoot has been corrected.
_SLIDER_RELEASE_OFFSET = 300


def name_file(name):
    """Return the first ``{name}_{n}.xlsx`` (n = 0, 1, 2, ...) that does not exist.

    The path is checked relative to the current working directory, so
    repeated runs never overwrite an earlier export.
    """
    ix = 0
    while os.path.exists(f'{name}_{ix}.xlsx'):
        ix += 1
    return f'{name}_{ix}.xlsx'


def get_new_page_info(context, Locator):
    """Open a job posting in a new tab, scrape its fields, and close the tab.

    Args:
        context: Browser context in which the click opens a new page.
        Locator: Locator of the search-result element to click.

    Returns:
        ``[position_name, job_company, job_request, salary, position_label,
        content]`` as read from the detail page. ``job_company`` is ``''``
        when the company element cannot be read.

    Raises:
        Whatever Playwright raises when one of the mandatory elements cannot
        be read; the detail tab is closed before the error propagates.
    """
    with context.expect_page() as new_page_info:
        Locator.click()
    new_page = new_page_info.value
    try:
        new_page.wait_for_load_state()
        # Short timeout so a missing element fails fast instead of stalling
        # the crawl for the default timeout.
        new_page.set_default_timeout(1000)
        # A page title containing '滑动' means a slider captcha was served
        # instead of the posting; solve it before scraping.
        if '滑动' in new_page.title():
            sliding_path(new_page)
            new_page.wait_for_load_state()
        position_name = new_page.locator(
            'xpath=/html/body/section[3]/div[1]/div[1]/span[1]').text_content()
        try:
            job_company = new_page.locator(
                'xpath=/html/body/main/aside/div[3]').text_content()
        except Exception:
            # The company panel is optional; fall back to an empty value.
            job_company = ''
        job_request = new_page.locator(
            'xpath=/html/body/section[3]/div[1]/div[2]').text_content()
        salary = new_page.locator(
            'xpath=/html/body/section[3]/div[1]/div[1]/span[2]').text_content()
        position_label = new_page.locator(
            'xpath=/html/body/section[4]/div/div[1]').text_content()
        content = new_page.locator(
            'xpath=/html/body/main/content/section[2]').text_content()
    finally:
        # Always close the detail tab, even when scraping fails, so that
        # failed postings do not leave tabs open in the browser.
        new_page.close()
    return [position_name, job_company, job_request, salary, position_label, content]


def _sample_track(start, stop, size, scale):
    """Return noisy x-offsets between ``start`` and ``stop`` for a mouse drag.

    ``size`` evenly spaced points are perturbed with Gaussian noise of
    standard deviation ``scale``; only the points at indices 0, 3, 8, 15, ...
    (``i**2 - 1``) are kept, so the gap between consecutive samples grows
    along the track.
    """
    track = np.linspace(start, stop, size) + np.random.normal(size=size, scale=scale)
    picks = np.arange(1, int(size ** 0.5 + 1)) ** 2 - 1
    return track[picks]


def sliding_path(page):
    """Drag the slider captcha on ``page`` along a randomised path.

    The handle is dragged past the end of the track by 20% of the track
    text's width, moved back towards the release offset, and then released.

    Raises:
        RuntimeError: If the slider handle or its track has no bounding box
            (``bounding_box()`` returned ``None``).
    """
    # Locate the slider handle and the element that defines the track width.
    slider_box = page.locator('xpath=//*[@id="nc_1_n1z"]').bounding_box()
    contain_box = page.locator('xpath=//*[@id="nc_1__scale_text"]/span').bounding_box()
    if slider_box is None or contain_box is None:
        raise RuntimeError('slider captcha elements are not visible on the page')

    distance = contain_box['width']
    start_x = int(slider_box['x'])
    mid_y = slider_box['y'] + slider_box['height'] / 2

    page.mouse.move(x=start_x, y=mid_y)
    page.mouse.down()

    # Phase 1: overshoot the end of the track by 20%.
    tolerance = distance * 0.2
    move_list = _sample_track(0, distance + tolerance, size=1000, scale=3)
    for move in move_list:
        page.mouse.move(x=start_x + move, y=mid_y, steps=3)

    # Phase 2: come back from the overshoot towards the release offset.
    move_list = _sample_track(move_list[-1], _SLIDER_RELEASE_OFFSET, size=100, scale=10)
    for move in move_list:
        page.mouse.move(x=start_x + move, y=mid_y, steps=10)

    page.mouse.move(x=start_x + _SLIDER_RELEASE_OFFSET, y=mid_y, steps=3)
    page.mouse.up()
    page.wait_for_load_state()


def run(playwright: Playwright, max_pages: int = 30) -> None:
    """Scrape job postings from the open search-results tab into an Excel file.

    Attaches over CDP to a browser listening on ``http://localhost:6568`` and
    uses its first context and first page. For up to ``max_pages`` result
    pages, every listed posting is opened and scraped, then the "next page"
    arrow is clicked.

    Any exception stops the crawl, is printed, and the rows collected so far
    are still written to a fresh ``猎聘_{n}.xlsx`` file.

    Args:
        playwright: Active Playwright instance.
        max_pages: Maximum number of result pages to walk through.
    """
    browser = playwright.chromium.connect_over_cdp('http://localhost:6568')
    context = browser.contexts[0]
    page = context.pages[0]
    info_list = []
    try:
        for _ in range(max_pages):
            Locators = page.locator(
                'xpath=//*[@id="lp-search-job-box"]/div[3]/section[1]/div[1]/div/div/div[1]/div/a/div[1]/div')
            for Locator in Locators.all():
                info = get_new_page_info(context, Locator)
                time.sleep(0.2)
                print(info)
                info_list.append(info)
            page.locator('xpath=//span[@aria-label="right"]').click()
            time.sleep(1)
            page.wait_for_load_state()
    except Exception as e:
        # Best effort: report the failure and keep what has been collected.
        print(e)
    df = pd.DataFrame(info_list, columns=['position_name', 'job_company', 'job_request', 'salary',
                                          'position_label', 'content'])
    df.to_excel(name_file('猎聘'), index=False)
    # Manual test of a single posting:
    # locator = page.locator('xpath=//*[@id="lp-search-job-box"]/div[3]/section[1]/div[1]/div/div/div[1]/div/a/div[1]/div').nth(1)
    # info = get_new_page_info(context, locator)
    # print(info)


if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)

评论收藏

内容反馈