move encoding methods into utils for external consumption

This commit is contained in:
Kenneth Reitz
2011-08-20 19:17:41 -04:00
parent 7fc8c7ccfd
commit 8fbb1e6d97
+61
View File
@@ -9,7 +9,9 @@ that are also useful for external consumption.
"""
import cgi
import cookielib
import re
def dict_from_cookiejar(cookiejar):
@@ -70,3 +72,62 @@ def add_dict_to_cookiejar(cj, cookie_dict):
cj.set_cookie(cookie)
return cj
def get_encodings_from_content(content):
    """Return the charsets declared in ``<meta>`` tags of *content*.

    The search is case-insensitive and yields one entry per match, in the
    order the declarations appear. An empty list is returned when no
    ``<meta ... charset=...>`` declaration is found.
    """
    # The charset value may be unquoted or wrapped in single/double quotes;
    # it ends at the first quote or at the closing '>' of the tag.
    meta_charset_pattern = r'<meta.*?charset=["\']*(.+?)["\'>]'
    return re.findall(meta_charset_pattern, content, flags=re.I)
def get_encoding_from_headers(headers):
    """Return the charset declared in the ``content-type`` header.

    :param headers: mapping of HTTP headers; only its ``get`` method is used,
        with the key ``'content-type'``.
    :returns: the charset value with surrounding quotes removed, or ``None``
        when the header is missing/empty or carries no ``charset`` parameter.
    """
    content_type = headers.get('content-type')

    # A response without a Content-Type header has no declared charset;
    # cgi.parse_header() cannot handle None, so bail out early.
    if not content_type:
        return None

    content_type, params = cgi.parse_header(content_type)

    if 'charset' in params:
        # Strip stray single or double quotes left around the value.
        return params['charset'].strip("'\"")

    return None
def get_unicode_from_response(r):
    """Return the response content decoded to unicode.

    Encodings are attempted in this order:

    1. the charset from the ``content-type`` header
    2. every charset found in ``<meta ... charset=XXX>`` tags of the content
    3. fall back to the last encoding attempted, replacing undecodable bytes

    :param r: response object; its ``headers`` and ``content`` attributes
        are read.
    :returns: the decoded content, or ``r.content`` unchanged when even the
        fallback cannot decode it (no encoding was found, the encoding name
        is unknown, or the content cannot be decoded at all).
    """
    tried_encodings = []

    # Try charset from content-type
    encoding = get_encoding_from_headers(r.headers)

    if encoding:
        try:
            return unicode(r.content, encoding)
        except (UnicodeError, LookupError, TypeError):
            # LookupError: the declared charset is not a known codec.
            # TypeError: the content cannot be decoded (e.g. already unicode).
            tried_encodings.append(encoding)

    # Try every encoding from <meta ... charset=XXX>
    for encoding in get_encodings_from_content(r.content):
        if encoding in tried_encodings:
            continue

        try:
            return unicode(r.content, encoding)
        except (UnicodeError, LookupError, TypeError):
            tried_encodings.append(encoding)

    # Fall back: reuse the last encoding attempted (None when nothing was
    # declared) and substitute replacement characters for undecodable bytes.
    try:
        return unicode(r.content, encoding, errors='replace')
    except (TypeError, LookupError):
        return r.content