From 8af172c5cefee5d9e2826d3bcbae87ff20aa5430 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Sat, 3 Mar 2018 08:17:42 -0500 Subject: [PATCH] vast improvements Signed-off-by: Kenneth Reitz --- requests_html.py | 17 +++++++++++++---- tests/test_requests_html.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/requests_html.py b/requests_html.py index e5e9241..41047eb 100644 --- a/requests_html.py +++ b/requests_html.py @@ -7,6 +7,7 @@ from typing import Set, Union, List, MutableMapping, Optional import pyppeteer import requests from pyquery import PyQuery +from pyquery.pyquery import fromstring from fake_useragent import UserAgent from lxml import etree @@ -64,6 +65,8 @@ class BaseParser: self.default_encoding = default_encoding self._encoding = None self._html = html.encode(DEFAULT_ENCODING) if isinstance(html, str) else html + self._lxml = None + self._pq = None @property def raw_html(self) -> _RawHTML: @@ -81,7 +84,7 @@ class BaseParser: (`learn more `_). """ if self._html: - return self._html.decode(self.encoding) + return self.raw_html.decode(self.encoding) else: return etree.tostring(self.element, encoding='unicode').strip() @@ -114,14 +117,20 @@ class BaseParser: """`PyQuery `_ representation of the :class:`Element ` or :class:`HTML `. """ - return PyQuery(self.element) + if self._pq is None: + self._pq = PyQuery(self.html) + + return self._pq @property def lxml(self) -> HtmlElement: """`lxml `_ representation of the :class:`Element ` or :class:`HTML `. """ - return soup_parse(self.html, features='html.parser') + if self._lxml is None: + self._lxml = soup_parse(self.html, features='html.parser') + + return self._lxml @property def text(self) -> _Text: @@ -291,7 +300,7 @@ class Element(BaseParser): """Returns a dictionary of the attributes of the :class:`Element ` (`learn more `_). """ - attrs = {k: self.pq.attr[k].strip() for k in self.element.keys()} + attrs = {k: v for k, v in self.element.items()} # Split class up, as there are ussually many of them: if 'class' in attrs: diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 930837c..d0f34ee 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -151,4 +151,4 @@ def test_bare_js_eval(): if __name__ == '__main__': - test_xpath() + test_anchor_links()