mirror of
https://github.com/kennethreitz/requests.git
synced 2026-06-05 22:50:18 +00:00
move encoding methods into utils for external consumption
This commit is contained in:
@@ -9,7 +9,9 @@ that are also useful for external consumption.
|
||||
|
||||
"""
|
||||
|
||||
import cgi
|
||||
import cookielib
|
||||
import re
|
||||
|
||||
|
||||
def dict_from_cookiejar(cookiejar):
|
||||
@@ -70,3 +72,62 @@ def add_dict_to_cookiejar(cj, cookie_dict):
|
||||
cj.set_cookie(cookie)
|
||||
|
||||
return cj
|
||||
|
||||
|
||||
def get_encodings_from_content(content):
    """Collect the charsets declared in ``<meta ... charset=...>`` tags.

    :param content: markup text to scan.
    :return: list of the charset names captured by the pattern, in order of
        appearance; an empty list when nothing matches.
    """
    # Case-insensitive; optional quotes around the value are left out of
    # the captured group.
    meta_charset = r'<meta.*?charset=["\']*(.+?)["\'>]'
    return re.findall(meta_charset, content, flags=re.I)
|
||||
|
||||
|
||||
|
||||
def get_encoding_from_headers(headers):
    """Return the charset declared in the ``content-type`` header.

    :param headers: mapping of HTTP header names to values; only its
        ``get('content-type')`` lookup is used.
    :return: the ``charset`` parameter with surrounding quotes stripped, or
        ``None`` when the header is missing, empty, or carries no charset.
    """
    content_type = headers.get('content-type')

    # A response without a content-type header yields None here, which
    # cgi.parse_header cannot handle -- bail out early instead of raising.
    if not content_type:
        return None

    content_type, params = cgi.parse_header(content_type)

    if 'charset' in params:
        # parse_header only removes double quotes; strip single ones too.
        return params['charset'].strip("'\"")

    return None
|
||||
|
||||
|
||||
def get_unicode_from_response(r):
    """Return the response content decoded to unicode, on a best-effort basis.

    Tried, in order:

    1. the charset from the content-type header
    2. every encoding found in ``<meta ... charset=XXX>`` tags
    3. fall back to the last encoding considered, replacing undecodable
       bytes

    :param r: object exposing ``headers`` and ``content`` attributes
        (presumably a Response -- confirm against callers).
    :return: a unicode string, or ``r.content`` unchanged when even the
        fallback decode cannot be performed.
    """
    tried_encodings = []

    # Try charset from content-type
    encoding = get_encoding_from_headers(r.headers)

    if encoding:
        try:
            return unicode(r.content, encoding)
        except (UnicodeError, LookupError):
            # LookupError: the server advertised a codec name Python does
            # not know; treat it like any other failed attempt.
            tried_encodings.append(encoding)

    # Try every encoding from <meta ... charset=XXX>
    for encoding in get_encodings_from_content(r.content):
        if encoding in tried_encodings:
            continue

        try:
            return unicode(r.content, encoding)
        except (UnicodeError, LookupError, TypeError):
            tried_encodings.append(encoding)

    # Fall back: `encoding` is the last candidate considered above (None
    # when there was none). unicode() raises TypeError for a None encoding
    # and LookupError for an unknown codec; hand back the raw content then.
    try:
        return unicode(r.content, encoding, errors='replace')
    except (TypeError, LookupError):
        return r.content
|
||||
|
||||
Reference in New Issue
Block a user