def unicode_from_html(content):
    """Attempts to decode an HTML string into unicode.

    Every encoding reported by ``get_encodings_from_content(content)`` is
    tried in order, and the result of the first successful decode is
    returned. If unsuccessful, the original content is returned.

    :param content: the HTML content to decode.
    """

    encodings = get_encodings_from_content(content)

    for encoding in encodings:

        try:
            return unicode(content, encoding)
        except (UnicodeError, LookupError, TypeError):
            # UnicodeError: content is not valid in this encoding.
            # LookupError: the declared charset names a codec Python does
            # not know; it is not a UnicodeError subclass, so it must be
            # listed explicitly or a bogus charset would abort the loop.
            # TypeError: content could not be decoded by unicode() at all.
            pass

    return content