Fixed URI encoding of reserved characters (Issue #369).

Previously, util.requote_path would unquote and requote all characters, causing reserved characters to become encoded (changing the semantics of the URI). Now, it has special code for unquoting just the unreserved characters, then quotes only illegal characters. This ensures that illegal characters are fixed, and URIs are normalised, but reserved characters do not erroneously become quoted. Test case test_session_with_escaped_url now passes.
2026-06-05 22:50:18 +00:00 · 2012-02-14 12:51:03 +11:00
parent c0763bb8d5
commit fcac1c3746
1 changed file with 27 additions and 1 deletion
@@ -396,6 +396,28 @@ def stream_decompress(iterator, mode='gzip'):
        if rv:
            yield rv

+# The unreserved URI characters (RFC 3986): ASCII letters, digits and "-._~"
+UNRESERVED_SET = frozenset(
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+    + "0123456789-._~")
+
+def unquote_unreserved(uri):
+    """Un-escape any percent-escape sequences in a URI that are unreserved
+    characters (RFC 3986) and return the resulting string.
+    This leaves all reserved, illegal and non-ASCII bytes encoded; malformed
+    escapes (e.g. '%zz' or a trailing '%4') are kept verbatim, without raising.
+    """
+    hexdigits = '0123456789ABCDEFabcdef'
+    parts = uri.split('%')
+    for i in range(1, len(parts)):
+        h = parts[i][0:2]
+        # Require exactly two hex digits: int() alone raises on '%zz' and
+        # would accept signs or whitespace such as '+1'.
+        valid = len(h) == 2 and h[0] in hexdigits and h[1] in hexdigits
+        c = chr(int(h, 16)) if valid else None
+        # Decode unreserved characters only; otherwise restore the '%'.
+        parts[i] = c + parts[i][2:] if c in UNRESERVED_SET else '%' + parts[i]
+    return ''.join(parts)

 def requote_path(path):
    """Re-quote the given URL path component.
@@ -404,5 +426,9 @@ def requote_path(path):
    ensure that it is fully and consistently quoted.
    """
    parts = path.split("/")
-    parts = (quote(unquote(part), safe="") for part in parts)
+    # Unquote only the unreserved characters
+    # Then quote only illegal characters (do not quote reserved, unreserved,
+    # or '%')
+    parts = (quote(unquote_unreserved(part), safe="!#$%&'()*+,/:;=?@[]~")
+             for part in parts)
    return "/".join(parts)