import sys
import asyncio
from urllib.parse import urlparse, urlunparse, urljoin
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures._base import TimeoutError
from functools import partial
from typing import Set, Union, List, MutableMapping, Optional
import pyppeteer
import requests
from pyquery import PyQuery
from fake_useragent import UserAgent
from lxml.html.clean import Cleaner
import lxml
from lxml import etree
from lxml.html import HtmlElement
from lxml.html import tostring as lxml_html_tostring
from lxml.html.soupparser import fromstring as soup_parse
from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode
DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older']
cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
useragent = None
# Typing.
_Find = Union[List['Element'], 'Element']
_XPath = Union[List[str], List['Element'], str, 'Element']
_Result = Union[List['Result'], 'Result']
_HTML = Union[str, bytes]
_BaseHTML = str
_UserAgent = str
_DefaultEncoding = str
_URL = str
_RawHTML = bytes
_Encoding = str
_LXML = HtmlElement
_Text = str
_Search = Result
_Containing = Union[str, List[str]]
_Links = Set[str]
_Attrs = MutableMapping
_Next = Union['HTML', List[str]]
_NextSymbol = List[str]
# Sanity checking.
try:
assert sys.version_info.major == 3
assert sys.version_info.minor > 5
except AssertionError:
raise RuntimeError('Requests-HTML requires Python 3.6+!')
class MaxRetries(Exception):
def __init__(self, message):
self.message = message
class BaseParser:
"""A basic HTML/Element Parser, for Humans.
:param element: The element from which to base the parsing upon.
:param default_encoding: Which encoding to default to.
:param html: HTML from which to base the parsing upon (optional).
:param url: The URL from which the HTML originated, used for ``absolute_links``.
"""
def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
self.element = element
self.url = url
self.skip_anchors = True
self.default_encoding = default_encoding
self._encoding = None
self._html = html.encode(DEFAULT_ENCODING) if isinstance(html, str) else html
self._lxml = None
self._pq = None
@property
def raw_html(self) -> _RawHTML:
"""Bytes representation of the HTML content.
(`learn more <http://www.diveintopython3.net/strings.html>`_).
"""
if self._html:
return self._html
else:
return etree.tostring(self.element, encoding='unicode').strip().encode(self.encoding)
@property
def html(self) -> _BaseHTML:
"""Unicode representation of the HTML content
(`learn more <http://www.diveintopython3.net/strings.html>`_).
"""
if self._html:
return self.raw_html.decode(self.encoding)
else:
return etree.tostring(self.element, encoding='unicode').strip()
@html.setter
def html(self, html: str) -> None:
self._html = html.encode(self.encoding)
@raw_html.setter
def raw_html(self, html: bytes) -> None:
"""Property setter for self.html."""
self._html = html
@property
def encoding(self) -> _Encoding:
"""The encoding string to be used, extracted from the HTML and
:class:`HTMLResponse <HTMLResponse>` headers.
"""
if self._encoding:
return self._encoding
# Scan meta tags for charset.
if self._html:
self._encoding = html_to_unicode(self.default_encoding, self._html)[0]
# Fall back to requests' detected encoding if decode fails.
try:
self.raw_html.decode(self.encoding)
except UnicodeDecodeError:
self._encoding = self.default_encoding
return self._encoding if self._encoding else self.default_encoding
@encoding.setter
def encoding(self, enc: str) -> None:
"""Property setter for self.encoding."""
self._encoding = enc
@property
def pq(self) -> PyQuery:
"""`PyQuery <https://pythonhosted.org/pyquery/>`_ representation
of the :class:`Element <Element>` or :class:`HTML <HTML>`.
"""
if self._pq is None:
self._pq = PyQuery(self.html)
return self._pq
@property
def lxml(self) -> HtmlElement:
"""`lxml <http://lxml.de>`_ representation of the
:class:`Element <Element>` or :class:`HTML <HTML>`.
"""
if self._lxml is None:
try:
self._lxml = soup_parse(self.html, features='html.parser')
except ValueError:
self._lxml = lxml.html.fromstring(self.html)
return self._lxml
@property
def text(self) -> _Text:
"""The text content of the
:class:`Element <Element>` or :class:`HTML <HTML>`.
"""
return self.pq.text()
@property
def full_text(self) -> _Text:
"""The full text content (including links) of the
:class:`Element <Element>` or :class:`HTML <HTML>`.
"""
return self.lxml.text_content()
def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find:
"""Given a CSS Selector, returns a list of
:class:`Element <Element>` objects or a single one.
:param selector: CSS Selector to use.
:param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
:param containing: If specified, only return elements that contain the provided text.
:param first: Whether or not to return just the first result.
:param _encoding: The encoding format.
Example CSS Selectors:
- ``a``
- ``a.someClass``
- ``a#someID``
- ``a[target=_blank]``
See W3School's `CSS Selectors Reference
<https://www.w3schools.com/cssref/css_selectors.asp>`_
for more details.
If ``first`` is ``True``, only returns the first
:class:`Element <Element>` found.
"""
# Convert a single containing into a list.
if isinstance(containing, str):
containing = [containing]
encoding = _encoding or self.encoding
elements = [
Element(element=found, url=self.url, default_encoding=encoding)
for found in self.pq(selector)
]
if containing:
elements_copy = elements.copy()
elements = []
for element in elements_copy:
if any([c.lower() in element.full_text.lower() for c in containing]):
elements.append(element)
elements.reverse()
# Sanitize the found HTML.
if clean:
elements_copy = elements.copy()
elements = []
for element in elements_copy:
element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
elements.append(element)
return _get_first_or_list(elements, first)
def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
"""Given an XPath selector, returns a list of
:class:`Element <Element>` objects or a single one.
:param selector: XPath Selector to use.
:param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
:param first: Whether or not to return just the first result.
:param _encoding: The encoding format.
If a sub-selector is specified (
requests-html
3星 · 超过75%的资源 需积分: 20 124 浏览量
2018-05-09
09:47:54
上传
评论 4
收藏 2.34MB ZIP 举报
weixin_41215722
- 粉丝: 0
- 资源: 1
最新资源
- libjpeg 编译所需的 Win32.mak vs编译libjpeg
- 自动驾驶-状态估计和定位-粒子滤波实现和源码.pdf
- 数据可视化-智慧物流服务中心大屏页面.zip
- yolov5,SSD 可能使用到的一些代码
- bbbbbbbbbbbbbbbbbb
- 安卓逆向学习笔记之Frida Stalker 还原OLLVM AES.docx
- 安卓逆向学习笔记之unicorn来trace还原OLLVM Base64.docx
- 基于jquery的自定义表格组件实现
- Nessus最新20240426离线安装插件all-2.0.tar.gz
- 最新版本私钥助记词碰撞器大富豪使用python进行制作通过接口的方式进行验证支持多币种多链多网络一分钟万次验证高出货率
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈