From c21f0784cd32fad3a18ec18d2a0705b9b010a2b0 Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Thu, 5 Apr 2018 13:47:39 +0200 Subject: [PATCH 1/2] Create LXML from raw_html Create LXML from `self.raw_html` instead of `self.html` to allow LXML to process plain XML pages as per beda42's findings in issue https://github.com/kennethreitz/requests-html/issues/145 I have tested this change with 200 sites and it seems to fix the issue. HTML pages seem to all be working as expected. I haven't run into an issue with any that I've tested. --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..0c3a989 100644 --- a/requests_html.py +++ b/requests_html.py @@ -159,7 +159,7 @@ class BaseParser: try: self._lxml = soup_parse(self.html, features='html.parser') except ValueError: - self._lxml = lxml.html.fromstring(self.html) + self._lxml = lxml.html.fromstring(self.raw_html) return self._lxml From 2a7d08722d9d28c23efac63e6191385f093aa38c Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Sat, 14 Apr 2018 21:32:00 +0200 Subject: [PATCH 2/2] Initialize PyQuery with lxml PyQuery with XML sites also has the same issue that LXML does with unicode encoded strings because it uses LXML to parse the page. The fix has already been applied to LXML, so we can fix the issue with PyQuery by passing the already parsed LXML into PyQuery. --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index 0c3a989..4c8a59f 100644 --- a/requests_html.py +++ b/requests_html.py @@ -146,7 +146,7 @@ class BaseParser: of the :class:`Element <Element>` or :class:`HTML <HTML>`. """ if self._pq is None: - self._pq = PyQuery(self.html) + self._pq = PyQuery(self.lxml) return self._pq