diff --git a/Pipfile b/Pipfile index 2bf247a..6eec2da 100644 --- a/Pipfile +++ b/Pipfile @@ -1,12 +1,9 @@ [[source]] - url = "https://pypi.python.org/simple" verify_ssl = true name = "pypi" - [packages] - requests = "*" pyquery = "*" fake-useragent = "*" @@ -14,10 +11,9 @@ parse = "*" "bs4" = "*" "w3lib" = "*" pyppeteer = "*" - +"rfc3986" = "*" [dev-packages] - twine = "*" requests-file = "*" pytest = "*" @@ -27,7 +23,5 @@ mypy = "*" pytest-asyncio = "*" psutil = "*" - [scripts] - tests = "pytest -v -m ok" diff --git a/Pipfile.lock b/Pipfile.lock index cf5abb6..703da15 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e43fea1bf779c3be1ffe88493cd840abb13135df99429de6dce896323cb1bbd8" + "sha256": "cbd9bf4e0f09bac901c6a04bf80b38da6b0dece2d58aa5e6f1468b3cafbcc44d" }, "pipfile-spec": 6, "requires": {}, @@ -134,6 +134,14 @@ "index": "pypi", "version": "==2.18.4" }, + "rfc3986": { + "hashes": [ + "sha256:632b8fcd2ac37f24334316227f909be4f9d0738cbf409404cff6fa5f69a24093", + "sha256:8458571c4c57e1cf23593ad860bb601b6a604df6217f829c2bc70dc4b5af941b" + ], + "index": "pypi", + "version": "==1.1.0" + }, "six": { "hashes": [ "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", diff --git a/requests_html.py b/requests_html.py index 0807881..ea8f3bd 100644 --- a/requests_html.py +++ b/requests_html.py @@ -126,6 +126,12 @@ class BaseParser: # Scan meta tags for charset. if self._html: self._encoding = html_to_unicode(self.default_encoding, self._html)[0] + # Fall back to requests' detected encoding if decode fails. + try: + self.raw_html.decode(self.encoding) + except UnicodeDecodeError: + self._encoding = self.default_encoding + return self._encoding if self._encoding else self.default_encoding