mirror of
https://github.com/kennethreitz/pipenv.git
synced 2026-06-05 22:50:18 +00:00
Merge pull request #3176 from frostming/non-ascii-toml
Fix non-ASCII support for prettytoml
This commit is contained in:
@@ -0,0 +1 @@
|
||||
Handle non-ASCII characters correctly in TOML.
|
||||
@@ -2,6 +2,7 @@
|
||||
"""
|
||||
A converter of python values to TOML Token instances.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
import codecs
|
||||
import datetime
|
||||
import six
|
||||
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow
|
||||
|
||||
|
||||
def _escape_single_line_quoted_string(text):
|
||||
if six.PY2:
|
||||
return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
|
||||
else:
|
||||
return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
|
||||
text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
|
||||
start = 0
|
||||
i = 0
|
||||
res = []
|
||||
_escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
|
||||
'\b': '\\b', '\f': '\\f', '"': '\\"'}
|
||||
|
||||
def flush():
|
||||
if start < i:
|
||||
res.append(text[start:i])
|
||||
return i + 1
|
||||
|
||||
while i < len(text):
|
||||
c = text[i]
|
||||
if c in _escapes:
|
||||
start = flush()
|
||||
res.append(_escapes[c])
|
||||
elif ord(c) < 0x20:
|
||||
start = flush()
|
||||
res.append('\\u%04x' % ord(c))
|
||||
i += 1
|
||||
|
||||
flush()
|
||||
return ''.join(res)
|
||||
|
||||
|
||||
def _create_multiline_string_token(text):
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from __future__ import unicode_literals
|
||||
import re
|
||||
import string
|
||||
import iso8601
|
||||
@@ -19,7 +20,7 @@ def deserialize(token):
|
||||
|
||||
Raises DeserializationError when appropriate.
|
||||
"""
|
||||
|
||||
|
||||
if token.type == TYPE_BOOLEAN:
|
||||
return _to_boolean(token)
|
||||
elif token.type == TYPE_INTEGER:
|
||||
@@ -39,42 +40,40 @@ def _unescape_str(text):
|
||||
"""
|
||||
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
|
||||
"""
|
||||
|
||||
# Detect bad escape jobs
|
||||
bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
|
||||
if bad_escape_regexp.findall(text):
|
||||
raise BadEscapeCharacter
|
||||
|
||||
# Do the unescaping
|
||||
if six.PY2:
|
||||
return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
|
||||
else:
|
||||
return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
|
||||
|
||||
|
||||
def _unicode_escaped_string(text):
|
||||
"""
|
||||
Escapes all unicode characters in the given string
|
||||
"""
|
||||
|
||||
if six.PY2:
|
||||
text = unicode(text)
|
||||
|
||||
def is_unicode(c):
|
||||
return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
|
||||
|
||||
def escape_unicode_char(x):
|
||||
if six.PY2:
|
||||
return x.encode('unicode-escape')
|
||||
text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
|
||||
tokens = []
|
||||
i = 0
|
||||
basicstr_re = re.compile(r'[^"\\\000-\037]*')
|
||||
unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
|
||||
escapes = {
|
||||
'b': '\b',
|
||||
't': '\t',
|
||||
'n': '\n',
|
||||
'f': '\f',
|
||||
'r': '\r',
|
||||
'\\': '\\',
|
||||
'"': '"',
|
||||
'/': '/',
|
||||
"'": "'"
|
||||
}
|
||||
while True:
|
||||
m = basicstr_re.match(text, i)
|
||||
i = m.end()
|
||||
tokens.append(m.group())
|
||||
if i == len(text) or text[i] != '\\':
|
||||
break
|
||||
else:
|
||||
return codecs.encode(x, 'unicode-escape')
|
||||
|
||||
if any(is_unicode(c) for c in text):
|
||||
homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
|
||||
homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
|
||||
return homogeneous_bytes.decode()
|
||||
else:
|
||||
return text
|
||||
i += 1
|
||||
if unicode_re.match(text, i):
|
||||
m = unicode_re.match(text, i)
|
||||
i = m.end()
|
||||
tokens.append(six.unichr(int(m.group(1), 16)))
|
||||
else:
|
||||
if text[i] not in escapes:
|
||||
raise BadEscapeCharacter
|
||||
tokens.append(escapes[text[i]])
|
||||
i += 1
|
||||
return ''.join(tokens)
|
||||
|
||||
|
||||
def _to_string(token):
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
diff --git a/pipenv/patched/prettytoml/tokens/py2toml.py b/pipenv/patched/prettytoml/tokens/py2toml.py
|
||||
index 8299195..2decd02 100644
|
||||
--- a/pipenv/patched/prettytoml/tokens/py2toml.py
|
||||
+++ b/pipenv/patched/prettytoml/tokens/py2toml.py
|
||||
@@ -2,6 +2,7 @@
|
||||
"""
|
||||
A converter of python values to TOML Token instances.
|
||||
"""
|
||||
+from __future__ import unicode_literals
|
||||
import codecs
|
||||
import datetime
|
||||
import six
|
||||
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow
|
||||
|
||||
|
||||
def _escape_single_line_quoted_string(text):
|
||||
- if six.PY2:
|
||||
- return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
|
||||
- else:
|
||||
- return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
|
||||
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
|
||||
+ start = 0
|
||||
+ i = 0
|
||||
+ res = []
|
||||
+ _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
|
||||
+ '\b': '\\b', '\f': '\\f', '"': '\\"'}
|
||||
+
|
||||
+ def flush():
|
||||
+ if start < i:
|
||||
+ res.append(text[start:i])
|
||||
+ return i + 1
|
||||
+
|
||||
+ while i < len(text):
|
||||
+ c = text[i]
|
||||
+ if c in _escapes:
|
||||
+ start = flush()
|
||||
+ res.append(_escapes[c])
|
||||
+ elif ord(c) < 0x20:
|
||||
+ start = flush()
|
||||
+ res.append('\\u%04x' % ord(c))
|
||||
+ i += 1
|
||||
+
|
||||
+ flush()
|
||||
+ return ''.join(res)
|
||||
|
||||
|
||||
def _create_multiline_string_token(text):
|
||||
diff --git a/pipenv/patched/prettytoml/tokens/toml2py.py b/pipenv/patched/prettytoml/tokens/toml2py.py
|
||||
index 2bf9c1c..5680443 100644
|
||||
--- a/pipenv/patched/prettytoml/tokens/toml2py.py
|
||||
+++ b/pipenv/patched/prettytoml/tokens/toml2py.py
|
||||
@@ -1,3 +1,4 @@
|
||||
+from __future__ import unicode_literals
|
||||
import re
|
||||
import string
|
||||
import iso8601
|
||||
@@ -39,42 +40,40 @@ def _unescape_str(text):
|
||||
"""
|
||||
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
|
||||
"""
|
||||
-
|
||||
- # Detect bad escape jobs
|
||||
- bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
|
||||
- if bad_escape_regexp.findall(text):
|
||||
- raise BadEscapeCharacter
|
||||
-
|
||||
- # Do the unescaping
|
||||
- if six.PY2:
|
||||
- return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
|
||||
- else:
|
||||
- return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
|
||||
-
|
||||
-
|
||||
-def _unicode_escaped_string(text):
|
||||
- """
|
||||
- Escapes all unicode characters in the given string
|
||||
- """
|
||||
-
|
||||
- if six.PY2:
|
||||
- text = unicode(text)
|
||||
-
|
||||
- def is_unicode(c):
|
||||
- return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
|
||||
-
|
||||
- def escape_unicode_char(x):
|
||||
- if six.PY2:
|
||||
- return x.encode('unicode-escape')
|
||||
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
|
||||
+ tokens = []
|
||||
+ i = 0
|
||||
+ basicstr_re = re.compile(r'[^"\\\000-\037]*')
|
||||
+ unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
|
||||
+ escapes = {
|
||||
+ 'b': '\b',
|
||||
+ 't': '\t',
|
||||
+ 'n': '\n',
|
||||
+ 'f': '\f',
|
||||
+ 'r': '\r',
|
||||
+ '\\': '\\',
|
||||
+ '"': '"',
|
||||
+ '/': '/',
|
||||
+ "'": "'"
|
||||
+ }
|
||||
+ while True:
|
||||
+ m = basicstr_re.match(text, i)
|
||||
+ i = m.end()
|
||||
+ tokens.append(m.group())
|
||||
+ if i == len(text) or text[i] != '\\':
|
||||
+ break
|
||||
else:
|
||||
- return codecs.encode(x, 'unicode-escape')
|
||||
-
|
||||
- if any(is_unicode(c) for c in text):
|
||||
- homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
|
||||
- homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
|
||||
- return homogeneous_bytes.decode()
|
||||
- else:
|
||||
- return text
|
||||
+ i += 1
|
||||
+ if unicode_re.match(text, i):
|
||||
+ m = unicode_re.match(text, i)
|
||||
+ i = m.end()
|
||||
+ tokens.append(six.unichr(int(m.group(1), 16)))
|
||||
+ else:
|
||||
+ if text[i] not in escapes:
|
||||
+ raise BadEscapeCharacter
|
||||
+ tokens.append(escapes[text[i]])
|
||||
+ i += 1
|
||||
+ return ''.join(tokens)
|
||||
|
||||
|
||||
def _to_string(token):
|
||||
@@ -1,3 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# We need to import the patched packages directly from sys.path, so the
|
||||
# identity checks can pass.
|
||||
import pipenv # noqa
|
||||
@@ -8,6 +9,7 @@ import os
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
import contoml
|
||||
from pipfile.api import PipfileParser
|
||||
from prettytoml import lexer, tokens
|
||||
from prettytoml.elements.atomic import AtomicElement
|
||||
@@ -104,3 +106,9 @@ class TestPipfileParser:
|
||||
def test_token_date(dt, content):
|
||||
token = create_primitive_token(dt)
|
||||
assert token == tokens.Token(tokens.TYPE_DATE, content)
|
||||
|
||||
|
||||
def test_dump_nonascii_string():
|
||||
content = 'name = "Stažené"\n'
|
||||
toml_content = contoml.dumps(contoml.loads(content))
|
||||
assert toml_content == content
|
||||
|
||||
Reference in New Issue
Block a user