cleaning

Signed-off-by: Kenneth Reitz <me@kennethreitz.org>
2026-06-05 23:00:20 +00:00 · 2018-03-09 10:42:04 -05:00
parent 89c001a02e
commit 3a5a94eb85
1 changed files with 33 additions and 3 deletions
@@ -11,9 +11,11 @@ import requests
 from pyquery import PyQuery

 from fake_useragent import UserAgent
+from lxml.html.clean import Cleaner
 import lxml
 from lxml import etree
 from lxml.html import HtmlElement
+from lxml.html import tostring as lxml_html_tostring
 from lxml.html.soupparser import fromstring as soup_parse
 from parse import search as parse_search
 from parse import findall, Result
@@ -23,6 +25,10 @@ DEFAULT_ENCODING = 'utf-8'
 DEFAULT_URL = 'https://example.org/'
 DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'

+cleaner = Cleaner()
+cleaner.javascript = True
+cleaner.style = True
+
 useragent = None

 # Typing.
@@ -94,7 +100,11 @@ class BaseParser:
            return etree.tostring(self.element, encoding='unicode').strip()

    @html.setter
-    def html(self, html: bytes) -> None:
+    def html(self, html: str) -> None:
+        self._html = html.encode(self.encoding)
+
+    @raw_html.setter
+    def raw_html(self, html: bytes) -> None:
        """Property setter for self.html."""
        self._html = html

@@ -195,11 +205,12 @@ class BaseParser:
        else:
            return url

-    def find(self, selector: str = "*", containing: _Containing = None, first: bool = False, _encoding: str = None) -> _Find:
+    def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find:
        """Given a CSS Selector, returns a list of
        :class:`Element <Element>` objects or a single one.

        :param selector: CSS Selector to use.
+        :param clean: Whether or not to sanitize the found HTML, stripping ``<script>`` and ``<style>`` tags.
        :param containing: If specified, only return elements that contain the provided text.
        :param first: Whether or not to return just the first result.
        :param _encoding: The encoding format.
@@ -239,13 +250,23 @@ class BaseParser:

            elements.reverse()

+        # Sanitize the found HTML.
+        if clean:
+            elements_copy = elements.copy()
+            elements = []
+
+            for element in elements_copy:
+                element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
+                elements.append(element)
+
        return _get_first_or_list(elements, first)

-    def xpath(self, selector: str, first: bool = False, _encoding: str = None) -> _XPath:
+    def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
        """Given an XPath selector, returns a list of
        :class:`Element <Element>` objects or a single one.

        :param selector: XPath Selector to use.
+        :param clean: Whether or not to sanitize the found HTML, stripping ``<script>`` and ``<style>`` tags.
        :param first: Whether or not to return just the first result.
        :param _encoding: The encoding format.

@@ -267,6 +288,15 @@ class BaseParser:
            for selection in selected
        ]

+        # Sanitize the found HTML.
+        if clean:
+            elements_copy = elements.copy()
+            elements = []
+
+            for element in elements_copy:
+                element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
+                elements.append(element)
+
        return _get_first_or_list(elements, first)

    def search(self, template: str) -> Result: