diff --git a/requests_html.py b/requests_html.py index 9c14ff5..9c25288 100644 --- a/requests_html.py +++ b/requests_html.py @@ -11,9 +11,11 @@ import requests from pyquery import PyQuery from fake_useragent import UserAgent +from lxml.html.clean import Cleaner import lxml from lxml import etree from lxml.html import HtmlElement +from lxml.html import tostring as lxml_html_tostring from lxml.html.soupparser import fromstring as soup_parse from parse import search as parse_search from parse import findall, Result @@ -23,6 +25,10 @@ DEFAULT_ENCODING = 'utf-8' DEFAULT_URL = 'https://example.org/' DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8' +cleaner = Cleaner() +cleaner.javascript = True +cleaner.style = True + useragent = None # Typing. @@ -94,7 +100,11 @@ class BaseParser: return etree.tostring(self.element, encoding='unicode').strip() @html.setter - def html(self, html: bytes) -> None: + def html(self, html: str) -> None: + self._html = html.encode(self.encoding) + + @raw_html.setter + def raw_html(self, html: bytes) -> None: """Property setter for self.html.""" self._html = html @@ -195,11 +205,12 @@ class BaseParser: else: return url - def find(self, selector: str = "*", containing: _Containing = None, first: bool = False, _encoding: str = None) -> _Find: + def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find: """Given a CSS Selector, returns a list of :class:`Element ` objects or a single one. :param selector: CSS Selector to use. + :param clean: Whether or not to sanitize the found HTML of ``