From c21f0784cd32fad3a18ec18d2a0705b9b010a2b0 Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Thu, 5 Apr 2018 13:47:39 +0200 Subject: [PATCH] Create LXML from raw_html Create LXML from `self.raw_html` instead of `self.html` to allow LXML to process plain XML pages as per beda42's findings in issue https://github.com/kennethreitz/requests-html/issues/145 I have tested this change with 200 sites and it seems to fix the issue. HTML pages all seem to be working as expected; I haven't run into an issue with any of the sites I've tested. --- requests_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requests_html.py b/requests_html.py index ea8f3bd..0c3a989 100644 --- a/requests_html.py +++ b/requests_html.py @@ -159,7 +159,7 @@ class BaseParser: try: self._lxml = soup_parse(self.html, features='html.parser') except ValueError: - self._lxml = lxml.html.fromstring(self.html) + self._lxml = lxml.html.fromstring(self.raw_html) return self._lxml