requests-html_htmlimport资源-CSDN文库

共21个文件

py：5个

html：4个

rst：2个

requests

html

3星 · 超过75%的资源需积分: 20 124 浏览量 2018-05-09 09:47:54 上传评论 4 收藏 2.34MB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

requests-html-master.zip （21个子文件）

requests-html-master

.travis.yml 380B

README.rst 14KB

docs

make.bat 821B

Makefile 614B

source

conf.py 6KB

_templates

sidebarlogo.html 2KB

sidebarintro.html 2KB

hacks.html 1KB

index.rst 17KB

_static

requests-html-logo.png 281KB

Pipfile 424B

tests

python.html 48KB

test_internet.py 332B

test_requests_html.py 6KB

requests_html.py 25KB

LICENSE 1KB

Pipfile.lock 28KB

ext

requests-html-logo.ai 2.77MB

setup.py 3KB

.gitignore 1KB

Makefile 147B

import sys
import asyncio
from urllib.parse import urlparse, urlunparse, urljoin
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures._base import TimeoutError
from functools import partial
from typing import Set, Union, List, MutableMapping, Optional

import pyppeteer
import requests
from pyquery import PyQuery

from fake_useragent import UserAgent
from lxml.html.clean import Cleaner
import lxml
from lxml import etree
from lxml.html import HtmlElement
from lxml.html import tostring as lxml_html_tostring
from lxml.html.soupparser import fromstring as soup_parse
from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode

DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older']

# Shared sanitizer used by ``find(clean=True)``: strips scripts and styles.
cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True

useragent = None

# Typing.
_Find = Union[List['Element'], 'Element']
_XPath = Union[List[str], List['Element'], str, 'Element']
_Result = Union[List['Result'], 'Result']
_HTML = Union[str, bytes]
_BaseHTML = str
_UserAgent = str
_DefaultEncoding = str
_URL = str
_RawHTML = bytes
_Encoding = str
_LXML = HtmlElement
_Text = str
_Search = Result
_Containing = Union[str, List[str]]
_Links = Set[str]
_Attrs = MutableMapping
_Next = Union['HTML', List[str]]
_NextSymbol = List[str]

# Sanity checking.
# An explicit raise is used instead of ``assert`` so the guard is not
# stripped when the interpreter runs with ``-O``.
if not (sys.version_info.major == 3 and sys.version_info.minor > 5):
    raise RuntimeError('Requests-HTML requires Python 3.6+!')


class MaxRetries(Exception):
    """Exception carrying a human-readable ``message`` attribute."""

    def __init__(self, message):
        super().__init__(message)
        self.message = message


class BaseParser:
    """A basic HTML/Element Parser, for Humans.

    :param element: The element from which to base the parsing upon.
    :param default_encoding: Which encoding to default to.
    :param html: HTML from which to base the parsing upon (optional).
    :param url: The URL from which the HTML originated, used for ``absolute_links``.

    """

    def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
        self.element = element
        self.url = url
        self.skip_anchors = True
        self.default_encoding = default_encoding
        self._encoding = None
        # HTML is always stored internally as bytes; ``str`` input is
        # encoded with DEFAULT_ENCODING.
        self._html = html.encode(DEFAULT_ENCODING) if isinstance(html, str) else html
        # Lazily built caches for the ``lxml`` and ``pq`` properties.
        self._lxml = None
        self._pq = None

    @property
    def raw_html(self) -> _RawHTML:
        """Bytes representation of the HTML content.
        (`learn more <http://www.diveintopython3.net/strings.html>`_).
        """
        if self._html:
            return self._html
        # No stored HTML: serialize the element and encode it.
        return etree.tostring(self.element, encoding='unicode').strip().encode(self.encoding)

    @property
    def html(self) -> _BaseHTML:
        """Unicode representation of the HTML content
        (`learn more <http://www.diveintopython3.net/strings.html>`_).
        """
        if self._html:
            return self.raw_html.decode(self.encoding)
        return etree.tostring(self.element, encoding='unicode').strip()

    @html.setter
    def html(self, html: str) -> None:
        """Property setter for self.html; stores the text encoded as bytes."""
        self._html = html.encode(self.encoding)

    @raw_html.setter
    def raw_html(self, html: bytes) -> None:
        """Property setter for self.raw_html."""
        self._html = html

    @property
    def encoding(self) -> _Encoding:
        """The encoding string to be used, extracted from the HTML and
        :class:`HTMLResponse <HTMLResponse>` headers.
        """
        if self._encoding:
            return self._encoding

        # Scan meta tags for charset.
        if self._html:
            self._encoding = html_to_unicode(self.default_encoding, self._html)[0]
            # Fall back to requests' detected encoding if decode fails.
            # NOTE(review): indentation was lost in SOURCE; this ``try`` is
            # placed inside the ``if`` because ``raw_html`` reads
            # ``self.encoding`` again when no HTML is stored -- confirm.
            try:
                self.raw_html.decode(self.encoding)
            except UnicodeDecodeError:
                self._encoding = self.default_encoding

        return self._encoding if self._encoding else self.default_encoding

    @encoding.setter
    def encoding(self, enc: str) -> None:
        """Property setter for self.encoding."""
        self._encoding = enc

    @property
    def pq(self) -> PyQuery:
        """`PyQuery <https://pythonhosted.org/pyquery/>`_ representation
        of the :class:`Element <Element>` or :class:`HTML <HTML>`.
        """
        if self._pq is None:
            self._pq = PyQuery(self.html)

        return self._pq

    @property
    def lxml(self) -> HtmlElement:
        """`lxml <http://lxml.de>`_ representation of the
        :class:`Element <Element>` or :class:`HTML <HTML>`.
        """
        if self._lxml is None:
            try:
                self._lxml = soup_parse(self.html, features='html.parser')
            except ValueError:
                # Soup-based parsing rejected the input; use lxml's own parser.
                self._lxml = lxml.html.fromstring(self.html)

        return self._lxml

    @property
    def text(self) -> _Text:
        """The text content of the
        :class:`Element <Element>` or :class:`HTML <HTML>`.
        """
        return self.pq.text()

    @property
    def full_text(self) -> _Text:
        """The full text content (including links) of the
        :class:`Element <Element>` or :class:`HTML <HTML>`.
        """
        return self.lxml.text_content()

    def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find:
        """Given a CSS Selector, returns a list of
        :class:`Element <Element>` objects or a single one.

        :param selector: CSS Selector to use.
        :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
        :param containing: If specified, only return elements that contain the provided text.
        :param first: Whether or not to return just the first result.
        :param _encoding: The encoding format.

        Example CSS Selectors:

        - ``a``
        - ``a.someClass``
        - ``a#someID``
        - ``a[target=_blank]``

        See W3School's `CSS Selectors Reference
        <https://www.w3schools.com/cssref/css_selectors.asp>`_
        for more details.

        If ``first`` is ``True``, only returns the first
        :class:`Element <Element>` found.
        """

        # Convert a single containing into a list.
        if isinstance(containing, str):
            containing = [containing]

        encoding = _encoding or self.encoding
        elements = [
            Element(element=found, url=self.url, default_encoding=encoding)
            for found in self.pq(selector)
        ]

        if containing:
            # Case-insensitive substring match against each element's text.
            needles = [c.lower() for c in containing]
            matched = []
            for element in elements:
                haystack = element.full_text.lower()
                if any(needle in haystack for needle in needles):
                    matched.append(element)
            elements = matched
            # NOTE(review): SOURCE reverses the filtered list; indentation was
            # lost, so placement inside this branch is assumed -- confirm.
            elements.reverse()

        # Sanitize the found HTML.
        if clean:
            for element in elements:
                element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))

        return _get_first_or_list(elements, first)

    def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
        """Given an XPath selector, returns a list of
        :class:`Element <Element>` objects or a single one.

        :param selector: XPath Selector to use.
        :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
        :param first: Whether or not to return just the first result.
        :param _encoding: The encoding format.

        If a sub-selector is specified (
        """
        # NOTE(review): SOURCE is truncated at this point -- the rest of the
        # docstring and the entire method body are not visible here and must
        # be restored from the original requests_html.py before use.

评论收藏

内容反馈