# coding: utf-8
"""
@Time : 11/7/2022 13:42
@Author: fff
@File: test2.py
@Software: PyCharm
"""
import asyncio,Class_function,random
from playwright.async_api import async_playwright
class spider:
def __init__(self):
    """Set up the shared helper object and the buffers filled during a crawl."""
    # Helper utilities (logging, md5, timestamps, image paths).
    self.Core_Function = Class_function.Class_function()
    # Results collected while crawling a page.
    self.page_result_list = []
    self.request_list = []
    self.response_list = []
    self.list_url = []
    self.HTML_list = []
async def click_function(self,page):
'''
;模拟遍历点击
:return:
'''
try:
num222 = await page.evaluate('''
window.stop();
num222=document.querySelectorAll('*').length;
num222;
''')
# self.Core_Function.callback_logging.info(num222)
#print(num222)
await page.evaluate('''
window.scrollBy(1920, 50);
treeWalker = document.createTreeWalker(document);
num111=0
while (treeWalker.nextNode() && num111<1500) {
console.log("[*] processing node " + treeWalker.currentNode.tagName + ' ' + treeWalker.currentNode.id);
if (treeWalker.currentNode.click) {
treeWalker.currentNode.target='';
treeWalker.currentNode.click();
num111=num111+1;
}
}
''')
flag_num=0
while await page.evaluate('num111') < 666:
#self.Core_Function.callback_logging().info('while')
await asyncio.sleep(0.5)
flag_num=flag_num+1
if flag_num<3:
break
except Exception as e:
self.Core_Function.callback_logging().error(e)
async def hook_requset(self,route):
    '''
    Intercept ("hook") outgoing requests for the page.

    Three cases, in order:
    1. The request matches ``self.target_request['url']`` -> replay it
       with the stored headers/method (and body unless it is the 'Null'
       sentinel), so non-GET targets can be re-issued in the browser.
    2. Any other navigation request -> record it into
       ``self.request_list`` and abort it, keeping the crawl on the
       target page.
    3. Everything else -> let it through unchanged.

    :param route: playwright Route being intercepted.
    :return: None
    '''
    # print(route.request.url)
    if route.request.url == self.target_request['url']:
        # 'Null' is the project-wide sentinel for "no request body".
        if self.target_request['body'] != 'Null':
            await route.continue_(headers=self.target_request['headers'], method=self.target_request['method'],
                                  post_data=self.target_request['body'])
        elif self.target_request['method'] == 'GET':
            await route.continue_(headers=self.target_request['headers'], method=self.target_request['method'])
        else:
            # Non-GET with no stored body: continue untouched.
            await route.continue_()
    elif route.request.url != 'about:blank' and route.request.is_navigation_request():
        # print(route.request.url)
        if route.request.method == 'GET':
            request = {"headers": route.request.headers, "method": route.request.method, "url": route.request.url,
                       "body": "Null"}
            self.request_list.append(request)
        # NOTE(review): this branch tests self.target_request['body'],
        # not the intercepted route.request — it looks like it should
        # inspect the current request's method/body instead; confirm intent.
        elif self.target_request['body'] != 'Null':
            request = {"headers": route.request.headers, "method": route.request.method,
                       "url": route.request.url, "body": route.request.post_data}
            self.request_list.append(request)
        # await route.continue_()
        # Abort so navigation away from the target page never happens.
        await route.abort(error_code='aborted')
    else:
        await route.continue_()
async def handle_popup(self, page):
    '''
    Close a popup window opened by the simulated clicks.

    :param page: playwright Page object of the popup.
    :return: None
    '''
    # New windows are never crawled; shut them immediately.
    await page.close()
async def handle_dialog(self, dialog):
    '''
    Dismiss a page dialog (alert/confirm/prompt) so the crawl never blocks.

    :param dialog: playwright Dialog object.
    :return: None
    '''
    await dialog.dismiss()
async def handle_network_http_request(self, request):
'''
; 获取页面http请求
:param request:
:return:
'''
try:
if request.resource_type not in ['image', 'stylesheet', 'websocket', 'media', 'font']:
request_data = {}
# print(request.url)
if request.post_data == None:
request_data['body'] = 'Null'
else:
request_data['body'] = request.post_data
request_data['url'] = request.url
request_data['headers'] = request.headers
request_data['method'] = request.method
request_data['time'] = self.Core_Function.callback_time(0)
request_data['describe'] = 'Null'
request_data['status'] = 0
#print(request_data)
self.request_list.append(request_data)
except Exception as e:
self.Core_Function.callback_logging().error(e)
async def handle_http_response(self, response):
'''
# 处理http响应
'''
try:
# print(response.request.url)
response_data = {}
html_data={}
if response.request.url == self.target_request['url']:
if response.status in [200, 301, 302, 404, 500]:
response_data['body'] = self.target_request['body']
response_data['url'] = self.target_request['url']
response_data['headers'] = self.target_request['headers']
response_data['method'] = self.target_request['method']
response_data['http_status_code'] = response.status
response_data['headers_response'] = response.headers
html=await response.text()
html_md5=self.Core_Function.md5_convert(html)
html_data['html']=html
html_data['html_md5'] = html_md5
html_data['time'] = self.Core_Function.callback_time(0)
html_data['status']=0
response_data['html_md5']=html_md5
response_data['time'] = self.Core_Function.callback_time(0)
response_data['describe'] = 'Null'
response_data['status'] = 0
# print(response_data)
self.response_list.append(response_data)
self.HTML_list.append(html_data)
except Exception as e:
self.Core_Function.callback_logging().error(e)
async def page_data(self,page,request):
'''
;页面信息获取
:param page:
:param request:
:return:
'''
html=await page.content()
html_md5 = self.Core_Function.md5_convert(html)
html_data={}
html_data['html'] = html
html_data['html_md5'] = html_md5
html_data['time'] = self.Core_Function.callback_time(0)
html_data['status'] = 0
request['status'] = 0
request['html_md5'] = html_md5
request['title'] = await page.title()
print(request['title'])
await page.evaluate('''
list_href=[]
window.open = function(url) { console.log("new link: " + url);list_href.push(url); };
window.close = function () { return false; };
''')
await page.evaluate('''
list_href=[]
for(i=0;i<document.getElementsByTagName("a").length;i++){
list_href.push(document.getElementsByTagName("a")[i].href); //输出该页面的所有链接。
}
''')
# print(await page.content())
filename_img = "%s/%s.jpeg" % (self.Core_Function.create_image_path(), ''.join(
random.sample('ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678zyxwvutsrqponmlkjihgfedcba', 15)))
try:
await page.screenshot(path=filename_img, type='jpeg', quality=15)
except Exception as error:
filename_img
没有合适的资源?快使用搜索试试~ 我知道了~
一个帅气的py爬虫模块 环境windows11+Ubuntu+"python3.10+"+playwright
共2个文件
py:2个
需积分: 1 3 下载量 117 浏览量
2023-01-10
17:29:47
上传
评论
收藏 7KB ZIP 举报
温馨提示
其实各种爬虫已经很多了,无论动态静态爬虫,基于http请求正则,还是基于浏览器,说大同小异可以有点夸张,但是好像都基于一个基础"URL",但在web安全领域,"URL"显得有些不够全面吧。 我认为一个优秀的爬虫是依据http请求,http请求包括"GET、POST、HEAD、OPTIONS、PUT、PATCH、DELETE、TRACE、CONNECT",但是用浏览器上重放post之类的请求就很麻烦吧,所以本脚本就解决了这个问题,在实际需求中如post xss,还有post之后才显示敏感信息,都需要这个功能。 动态爬虫的并发问题,我是通过打开多chrome tab来实现异步并发,可能和搞个浏览器池多进程多线程比有点慢,但是足够我的需求了。 还有收集足够的页面信息,这点就是大同小异。 反爬虫监控。 页面的动态操作。 代码从我扫描器里扒下来的,可能有几个小报错,自己修修改改下,反正核心功能就是基于playwright的爬虫,不行就查api,https://playwright.dev/python/docs/pages
资源推荐
资源详情
资源评论
收起资源包目录
spider.zip (2个子文件)
spider.py 21KB
Class_function.py 8KB
共 2 条
- 1
资源评论
Wis57
- 粉丝: 304
- 资源: 450
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功