From 4decc7986e32bb8f3511df3dd0c9b1c1d57453c1 Mon Sep 17 00:00:00 2001
From: Martijn Pieters <mj@zopatista.com>
Date: Thu, 25 Oct 2012 17:43:52 +0200
Subject: [PATCH 1/6] Use a JSON-specific encoding detection when no encoding
 has been specified.

JSON *must* be encoded using UTF-8, UTF-16 or UTF-32 (see the [RFC][1]); detect the encoding based on the fact that JSON always starts with 2 ASCII characters.

[1]: http://tools.ietf.org/html/rfc4627#section-3
---
 requests/models.py  | 15 ++++++++++++-
 requests/utils.py   | 35 ++++++++++++++++++++++++++++++
 tests/test_utils.py | 53 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_utils.py

diff --git a/requests/models.py b/requests/models.py
index c19d3cca..06c8a71a 100644
--- a/requests/models.py
+++ b/requests/models.py
@@ -31,7 +31,8 @@ from .exceptions import (
 from .utils import (
     get_encoding_from_headers, stream_untransfer, guess_filename, requote_uri,
     stream_decode_response_unicode, get_netrc_auth, get_environ_proxies,
-    to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices)
+    to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices,
+    guess_json_utf)
 from .compat import (
     cookielib, urlparse, urlunparse, urljoin, urlsplit, urlencode, str, bytes,
     StringIO, is_py2, chardet, json, builtin_str)
@@ -842,6 +843,18 @@ class Response(object):
     @property
     def json(self):
         """Returns the json-encoded content of a response, if any."""
+
+        if not self.encoding and len(self.content) > 3:
+            # No encoding set. JSON RFC 4627 section 3 states we should expect
+            # UTF-8, -16 or -32. Detect which one to use; If the detection or
+            # decoding fails, fall back to `self.text` (using chardet to make
+            # a best guess).
+            encoding = guess_json_utf(self.content)
+            if encoding is not None:
+                try:
+                    return json.loads(self.content.decode(encoding))
+                except (ValueError, UnicodeDecodeError):
+                    pass
         try:
             return json.loads(self.text or self.content)
         except ValueError:
diff --git a/requests/utils.py b/requests/utils.py
index 7e9f6315..ec9f4d2c 100644
--- a/requests/utils.py
+++ b/requests/utils.py
@@ -579,3 +579,38 @@ def parse_header_links(value):
         links.append(link)
 
     return links
+
+
+# Null bytes; no need to recreate these on each call to guess_json_utf
+_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
+_null2 = _null * 2
+_null3 = _null * 3
+
+
+def guess_json_utf(data):
+    """Guess the UTF codec of the JSON bytes ``data``, or None if unknown.
+
+    JSON always starts with two ASCII characters, so a BOM, or the
+    count and position of nulls in the first four bytes, give the codec.
+    """
+    sample = data[:4]
+    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
+        return 'utf-32'     # BOM included
+    if sample[:3] == codecs.BOM_UTF8:
+        return 'utf-8-sig'  # BOM included, MS style (discouraged)
+    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+        return 'utf-16'     # BOM included
+    nullcount = sample.count(_null)
+    if nullcount == 0:
+        return 'utf-8'
+    if nullcount == 2:
+        if sample[::2] == _null2:   # 1st and 3rd are null
+            return 'utf-16-be'
+        if sample[1::2] == _null2:  # 2nd and 4th are null
+            return 'utf-16-le'
+    if nullcount == 3:
+        if sample[:3] == _null3:
+            return 'utf-32-be'
+        if sample[1:] == _null3:
+            return 'utf-32-le'
+    return None
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..27fa18e4
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import unittest
+import random
+
+# Path hack.
+sys.path.insert(0, os.path.abspath('..'))
+import requests.utils
+from requests.compat import is_py3, bytes
+
+
+class GuessJSONUTFTests(unittest.TestCase):
+    """Smoke test for https functionality."""
+
+    codecs = (
+        'utf-8', 'utf-8-sig',
+        'utf-16', 'utf-16-le', 'utf-16-be',
+        'utf-32', 'utf-32-le', 'utf-32-be'
+    )
+
+    def test_guess_encoding(self):
+        # Throw 4-character ASCII strings (encoded to a UTF encoding)
+        # at the guess routine; it should correctly guess all codecs.
+        unichr = chr if is_py3 else __builtins__.unichr
+        guess = requests.utils.guess_json_utf
+        for c in range(33, 127):  # printable only
+            sample = unichr(c) * 4
+            for codec in self.codecs:
+                res = guess(sample.encode(codec))
+                self.assertEqual(res, codec)
+
+    def test_smoke_encoding(self):
+        # Throw random 4-byte strings at the guess function.
+        # Any guess for a UTF encoding is verified, a decode exception
+        # is a test failure.
+        chr = (lambda c: bytes([c])) if is_py3 else __builtins__.chr
+        guess = requests.utils.guess_json_utf
+        for i in range(1000):
+            sample = bytes().join(
+                [chr(random.randrange(256)) for _ in range(4)])
+            res = guess(sample)
+            if res is not None and res != 'utf-8':
+                # This should decode without errors if this is *really*
+                # something in this encoding. Skip UTF-8, it is more
+                # picky about valid data.
+                sample.decode(res)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 9832bd89172fb1200e8fdfb2e1cc334ad0fcf332 Mon Sep 17 00:00:00 2001
From: Martijn Pieters <mj@zopatista.com>
Date: Thu, 25 Oct 2012 17:56:19 +0200
Subject: [PATCH 2/6] Correct a c&p mistake: set a correct docstring for the
 unit test class.

---
 tests/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 27fa18e4..dabb579e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -13,7 +13,7 @@ from requests.compat import is_py3, bytes
 
 
 class GuessJSONUTFTests(unittest.TestCase):
-    """Smoke test for https functionality."""
+    """Tests for the JSON UTF encoding guessing code."""
 
     codecs = (
         'utf-8', 'utf-8-sig',

From 25f2806f23d2a88920a3558f8bc4f798a115d014 Mon Sep 17 00:00:00 2001
From: Martijn Pieters <mj@zopatista.com>
Date: Thu, 25 Oct 2012 18:12:55 +0200
Subject: [PATCH 3/6] Cheek: insert myself into AUTHORS.rst.

---
 AUTHORS.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/AUTHORS.rst b/AUTHORS.rst
index 9a7f25d5..3c074d1b 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -115,3 +115,4 @@ Patches and Suggestions
 - Rhys Elsmore
 - André Graf (dergraf)
 - Stephen Zhuang (everbird)
+- Martijn Pieters

From a4be9a2578dcb8f0862cc80f1e37243edfc04bd7 Mon Sep 17 00:00:00 2001
From: Martijn Pieters <mj@zopatista.com>
Date: Thu, 25 Oct 2012 18:22:07 +0200
Subject: [PATCH 4/6] Redefine the `unichr` and bytes-variant of `chr` at
 module level.

Needed to appease Travis; its python 2.6 and 2.7 builds are weird and the `__builtins__` dict is not following CPython conventions.
---
 tests/test_utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index dabb579e..1122fb37 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -12,6 +12,11 @@ import requests.utils
 from requests.compat import is_py3, bytes
 
 
+if is_py3:
+    unichr = chr
+    chr = lambda c: bytes([c])
+
+
 class GuessJSONUTFTests(unittest.TestCase):
     """Tests for the JSON UTF encoding guessing code."""
 
@@ -24,7 +29,6 @@ class GuessJSONUTFTests(unittest.TestCase):
     def test_guess_encoding(self):
         # Throw 4-character ASCII strings (encoded to a UTF encoding)
         # at the guess routine; it should correctly guess all codecs.
-        unichr = chr if is_py3 else __builtins__.unichr
         guess = requests.utils.guess_json_utf
         for c in range(33, 127):  # printable only
             sample = unichr(c) * 4
@@ -36,7 +40,6 @@ class GuessJSONUTFTests(unittest.TestCase):
         # Throw random 4-byte strings at the guess function.
         # Any guess for a UTF encoding is verified, a decode exception
         # is a test failure.
-        chr = (lambda c: bytes([c])) if is_py3 else __builtins__.chr
         guess = requests.utils.guess_json_utf
         for i in range(1000):
             sample = bytes().join(

From be01a35ef12c7e71c0e71c4e37d1f1a392a66fd8 Mon Sep 17 00:00:00 2001
From: Martijn Pieters <mj@zopatista.com>
Date: Thu, 25 Oct 2012 18:27:21 +0200
Subject: [PATCH 5/6] Better not call it `chr`, rename to `byteschr`.

---
 tests/test_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 1122fb37..c0560ec1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -14,7 +14,9 @@ from requests.compat import is_py3, bytes
 
 if is_py3:
     unichr = chr
-    chr = lambda c: bytes([c])
+    byteschr = lambda c: bytes([c])
+else:
+    byteschr = chr
 
 
 class GuessJSONUTFTests(unittest.TestCase):
@@ -43,7 +45,7 @@ class GuessJSONUTFTests(unittest.TestCase):
         guess = requests.utils.guess_json_utf
         for i in range(1000):
             sample = bytes().join(
-                [chr(random.randrange(256)) for _ in range(4)])
+                [byteschr(random.randrange(256)) for _ in range(4)])
             res = guess(sample)
             if res is not None and res != 'utf-8':
                 # This should decode without errors if this is *really*

From e26ccb34eb1c0d3948bfd9e50ffe333605ae554d Mon Sep 17 00:00:00 2001
From: Martijn Pieters <mj@zopatista.com>
Date: Fri, 26 Oct 2012 12:15:27 +0200
Subject: [PATCH 6/6] Fix the smoke test in the face of UTF-16 surrogate pairs.

If the random data starts with a UTF-16 BOM *and* the next two bytes are for a character in the `\ud800`-`\udfff` range, decoding would fail. Small chance, but still possible.

Extend it to check the UTF-8 error as well. The goal is to test that the guesser is *mostly* correct and, in the cases where it isn't, to verify that the failure was to be expected. Most of all, the test checks that the function doesn't buckle under wildly unexpected data.
---
 tests/test_utils.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index c0560ec1..5cd0684e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import codecs
 import os
 import sys
 import unittest
@@ -47,12 +48,30 @@ class GuessJSONUTFTests(unittest.TestCase):
             sample = bytes().join(
                 [byteschr(random.randrange(256)) for _ in range(4)])
             res = guess(sample)
-            if res is not None and res != 'utf-8':
+            if res is not None:
                 # This should decode without errors if this is *really*
-                # something in this encoding. Skip UTF-8, it is more
-                # picky about valid data.
-                sample.decode(res)
-
+                # something in this encoding. However, UTF-8 is a lot
+                # more picky, so we expect errors there. UTF-16 surrogate
+                # pairs also fail
+                try:
+                    sample.decode(res)
+                except UnicodeDecodeError as e:
+                    self.assertEqual(e.args[0].replace('-', '').lower(),
+                                     res.replace('-', '').lower())
+                    if res == 'utf-8':
+                        self.assertTrue(e.args[-1], (
+                            'invalid continuation byte',
+                            'invalid start byte'))
+                        continue
+                    if res == 'utf-16':
+                        self.assertEqual(e.args[-1], 'unexpected end of data')
+                        self.assertTrue(sample[:2] in (
+                            codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE))
+                        # the second two bytes are in the range \ud800-\udfff
+                        # if someone wants to add tests for that as well. I don't
+                        # see the need; we are not testing UTF decoding here.
+                        continue
+                    raise
 
 if __name__ == '__main__':
     unittest.main()