# Reconstructed from git patch 8fbb1e6d97cda90d588d4263a18906a52d147fba
# (Kenneth Reitz, Sat, 20 Aug 2011 19:17:41 -0400):
#   "move encoding methods into utils for external consumption"
# The patch appends the three helpers below to requests/utils.py, after
# ``add_dict_to_cookiejar``.
#
# NOTE(review): the patch text had its ``<meta ...>`` fragments stripped (the
# regex literal was reduced to ``r']'``); the pattern below is a
# reconstruction -- confirm against the upstream commit.
# NOTE(review): the original parsed the header with ``cgi.parse_header``.
# ``email.message`` is used instead because it exists on both Python 2 and
# Python 3, whereas ``cgi`` was removed in Python 3.13.

import re
from email.message import Message

# Compiled once at import time instead of on every call.
_META_CHARSET_RE = re.compile(
    r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)


def get_encodings_from_content(content):
    """Return the charsets declared in ``<meta ... charset=...>`` tags.

    :param content: document body, as text or as a byte string.
    :returns: list of charset names in document order; an empty list when
        ``content`` is empty or declares no charset.
    """

    if not content:
        return []

    if isinstance(content, bytes) and not isinstance(content, str):
        # Python 3 byte string: the pattern is a text pattern, and charset
        # declarations are plain ASCII, so a lossy ASCII decode is enough
        # for the search.  (On Python 2 ``bytes is str``; nothing to do.)
        content = content.decode('ascii', 'replace')

    return _META_CHARSET_RE.findall(content)


def get_encoding_from_headers(headers):
    """Return the charset named by the ``content-type`` HTTP header.

    :param headers: mapping of header names to values; only the
        ``'content-type'`` key is read.
    :returns: the charset with surrounding quotes removed, or ``None`` when
        the header is missing, empty, or has no ``charset`` parameter.
    """

    content_type = headers.get('content-type')

    # A response without a content-type header is legal; do not try to
    # parse ``None``.
    if not content_type:
        return None

    message = Message()
    message['content-type'] = content_type
    charset = message.get_param('charset')

    if charset is None:
        return None

    if isinstance(charset, tuple):
        # RFC 2231 encoded parameter: (charset, language, value).
        charset = charset[2]

    return charset.strip("'\"")


def get_unicode_from_response(r):
    """Return the response content decoded to unicode text.

    Tried, in order:

    1. the charset from the ``content-type`` header
    2. every charset declared in a ``<meta ... charset=...>`` tag
    3. the same charsets again, replacing undecodable bytes

    :param r: object exposing ``content`` and ``headers`` attributes.
    :returns: the decoded text.  The content is returned unchanged when it
        is not a byte string, or when no usable charset is known.
    """

    content = r.content

    # Already text (or missing): there is nothing to decode.
    if not isinstance(content, bytes):
        return content

    # Candidate charsets in priority order, without duplicates.
    candidates = []

    header_encoding = get_encoding_from_headers(r.headers)
    if header_encoding:
        candidates.append(header_encoding)

    for encoding in get_encodings_from_content(content):
        if encoding not in candidates:
            candidates.append(encoding)

    # Strict pass: first charset that decodes the content cleanly wins.
    for encoding in candidates:
        try:
            return content.decode(encoding)
        except (UnicodeError, LookupError):
            # Wrong charset, or a charset name Python has no codec for.
            continue

    # Lossy pass: accept replacement characters rather than fail.
    for encoding in candidates:
        try:
            return content.decode(encoding, 'replace')
        except (UnicodeError, LookupError):
            continue

    # No usable charset at all: hand back the raw content.
    return content