Fixed URI encoding of reserved characters (Issue #369).

Previously, util.requote_path would unquote and requote all characters,
causing reserved characters to become encoded (changing the semantics of the
URI). Now, it has special code for unquoting just the unreserved characters,
then quotes only illegal characters.
This ensures that illegal characters are fixed, and URIs are normalised, but
reserved characters do not erroneously become quoted.
Test case test_session_with_escaped_url now passes.
This commit is contained in:
Matt Giuca
2012-02-14 12:51:03 +11:00
parent c0763bb8d5
commit fcac1c3746
+27 -1
View File
@@ -396,6 +396,28 @@ def stream_decompress(iterator, mode='gzip'):
if rv:
yield rv
# The unreserved URI characters (RFC 3986, section 2.3). These are the only
# characters whose percent-encoded and literal forms are equivalent, so they
# are the only ones that are safe to decode during normalisation.
UNRESERVED_SET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    + "0123456789-._~")


def unquote_unreserved(uri):
    """Un-escape any percent-escape sequences in a URI that are unreserved
    characters.

    This leaves all reserved, illegal and non-ASCII bytes encoded.

    Malformed escapes (a '%' that is not followed by exactly two ASCII
    hexadecimal digits, e.g. "%zz", "%-1" or a trailing "%a") are not valid
    percent-encodings; they are passed through unchanged instead of raising
    ValueError.

    :param uri: the URI (or URI component) string to normalise.
    :returns: a copy of ``uri`` in which every escape that encodes an
        unreserved character has been replaced by that character.
    """
    # Checked explicitly rather than relying on int(h, 16): int() also
    # accepts signs, surrounding whitespace and non-ASCII digits, and raises
    # ValueError on anything else.
    hex_digits = "0123456789ABCDEFabcdef"

    # After splitting on '%', every element except the first starts with
    # the (up to) two characters that followed a '%' in the input.
    parts = uri.split('%')
    for i in range(1, len(parts)):
        h = parts[i][0:2]
        if len(h) == 2 and h[0] in hex_digits and h[1] in hex_digits:
            c = chr(int(h, 16))
            if c in UNRESERVED_SET:
                # Equivalent to its literal form: decode it.
                parts[i] = c + parts[i][2:]
                continue
        # Reserved, illegal, non-ASCII or malformed: restore the '%' that
        # split() consumed and keep the text exactly as it was.
        parts[i] = '%' + parts[i]
    return ''.join(parts)
def requote_path(path):
"""Re-quote the given URL path component.
@@ -404,5 +426,9 @@ def requote_path(path):
ensure that it is fully and consistently quoted.
"""
parts = path.split("/")
parts = (quote(unquote(part), safe="") for part in parts)
# Unquote only the unreserved characters
# Then quote only illegal characters (do not quote reserved, unreserved,
# or '%')
parts = (quote(unquote_unreserved(part), safe="!#$%&'()*+,/:;=?@[]~")
for part in parts)
return "/".join(parts)