From 8f30583bcc6feb390adbeda469ff4b8d0b580596 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Mon, 26 Feb 2018 18:14:50 -0500 Subject: [PATCH] much better encoding detection Signed-off-by: Kenneth Reitz --- Pipfile | 3 ++- Pipfile.lock | 16 +++++++++++++++- requests_html.py | 15 ++------------- setup.py | 2 +- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/Pipfile b/Pipfile index d14dd0c..2aca609 100644 --- a/Pipfile +++ b/Pipfile @@ -12,7 +12,8 @@ pyquery = "*" fake-useragent = "*" parse = "*" "bs4" = "*" -"PyQt5" = "*" +"pyqt5" = "*" +"w3lib" = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 5a4da27..d150e86 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "f1c86a7d3114319829cb5558a7a21eeefce9dcc0f7f3869ef57dc2f5eab042fd" + "sha256": "c88fa0d21e44545a4b9ce13ff463f2da284a846029b315dbb8757057b70166a4" }, "host-environment-markers": { "implementation_name": "cpython", @@ -154,12 +154,26 @@ ], "version": "==4.19.7" }, + "six": { + "hashes": [ + "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb", + "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9" + ], + "version": "==1.11.0" + }, "urllib3": { "hashes": [ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b", "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f" ], "version": "==1.22" + }, + "w3lib": { + "hashes": [ + "sha256:aaf7362464532b1036ab0092e2eee78e8fd7b56787baa9ed4967457b083d011b", + "sha256:55994787e93b411c2d659068b51b9998d9d0c05e0df188e6daf8f45836e1ea38" + ], + "version": "==1.19.0" } }, "develop": { diff --git a/requests_html.py b/requests_html.py index dfea366..70cbb61 100644 --- a/requests_html.py +++ b/requests_html.py @@ -1,4 +1,3 @@ -import sys from urllib.parse import urlparse, urlunparse import requests @@ -9,6 +8,7 @@ from lxml import etree from lxml.html.soupparser import fromstring from parse import search as parse_search from parse import findall +from w3lib.encoding import html_to_unicode try: from PyQt5.QtWidgets import QApplication @@ -50,18 +50,7 @@ class BaseParser: return self._encoding # Scan meta tags for chaset. - for meta_tag in self.find('meta', _encoding=self.default_encoding): - - # HTML 5 support. - if 'charset' in meta_tag.attrs: - self._encoding = meta_tag.attrs['charset'] - - # HTML 4 support. - if 'content' in meta_tag.attrs: - try: - self._encoding = meta_tag.attrs['content'].split('charset=')[1] - except IndexError: - pass + self._encoding = html_to_unicode(self.default_encoding, self.html)[0] return self._encoding if self._encoding else self.default_encoding diff --git a/setup.py b/setup.py index 91e781a..22b4b38 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ VERSION = '0.3.2' # What packages are required for this module to be executed? REQUIRED = [ - 'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4' + 'requests', 'pyquery', 'fake-useragent', 'parse', 'bs4', 'w3lib' ] # The rest you shouldn't have to touch too much :)