Merge pull request #3176 from frostming/non-ascii-toml

Fix non-ASCII support for prettytoml
This commit is contained in:
Dan Ryan
2018-11-07 17:17:17 -05:00
committed by GitHub
5 changed files with 201 additions and 40 deletions
+1
View File
@@ -0,0 +1 @@
Handle non-ASCII characters correctly in TOML.
+25 -4
View File
@@ -2,6 +2,7 @@
"""
A converter of python values to TOML Token instances.
"""
from __future__ import unicode_literals
import codecs
import datetime
import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow
def _escape_single_line_quoted_string(text):
if six.PY2:
return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
else:
return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
start = 0
i = 0
res = []
_escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
'\b': '\\b', '\f': '\\f', '"': '\\"'}
def flush():
if start < i:
res.append(text[start:i])
return i + 1
while i < len(text):
c = text[i]
if c in _escapes:
start = flush()
res.append(_escapes[c])
elif ord(c) < 0x20:
start = flush()
res.append('\\u%04x' % ord(c))
i += 1
flush()
return ''.join(res)
def _create_multiline_string_token(text):
+35 -36
View File
@@ -1,3 +1,4 @@
from __future__ import unicode_literals
import re
import string
import iso8601
@@ -19,7 +20,7 @@ def deserialize(token):
Raises DeserializationError when appropriate.
"""
if token.type == TYPE_BOOLEAN:
return _to_boolean(token)
elif token.type == TYPE_INTEGER:
@@ -39,42 +40,40 @@ def _unescape_str(text):
"""
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
"""
# Detect bad escape jobs
bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
if bad_escape_regexp.findall(text):
raise BadEscapeCharacter
# Do the unescaping
if six.PY2:
return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
else:
return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
def _unicode_escaped_string(text):
"""
Escapes all unicode characters in the given string
"""
if six.PY2:
text = unicode(text)
def is_unicode(c):
return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
def escape_unicode_char(x):
if six.PY2:
return x.encode('unicode-escape')
text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
tokens = []
i = 0
basicstr_re = re.compile(r'[^"\\\000-\037]*')
unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
escapes = {
'b': '\b',
't': '\t',
'n': '\n',
'f': '\f',
'r': '\r',
'\\': '\\',
'"': '"',
'/': '/',
"'": "'"
}
while True:
m = basicstr_re.match(text, i)
i = m.end()
tokens.append(m.group())
if i == len(text) or text[i] != '\\':
break
else:
return codecs.encode(x, 'unicode-escape')
if any(is_unicode(c) for c in text):
homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
return homogeneous_bytes.decode()
else:
return text
i += 1
if unicode_re.match(text, i):
m = unicode_re.match(text, i)
i = m.end()
tokens.append(six.unichr(int(m.group(1), 16)))
else:
if text[i] not in escapes:
raise BadEscapeCharacter
tokens.append(escapes[text[i]])
i += 1
return ''.join(tokens)
def _to_string(token):
@@ -0,0 +1,132 @@
diff --git a/pipenv/patched/prettytoml/tokens/py2toml.py b/pipenv/patched/prettytoml/tokens/py2toml.py
index 8299195..2decd02 100644
--- a/pipenv/patched/prettytoml/tokens/py2toml.py
+++ b/pipenv/patched/prettytoml/tokens/py2toml.py
@@ -2,6 +2,7 @@
"""
A converter of python values to TOML Token instances.
"""
+from __future__ import unicode_literals
import codecs
import datetime
import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow
def _escape_single_line_quoted_string(text):
- if six.PY2:
- return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
- else:
- return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+ start = 0
+ i = 0
+ res = []
+ _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+ '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+ def flush():
+ if start < i:
+ res.append(text[start:i])
+ return i + 1
+
+ while i < len(text):
+ c = text[i]
+ if c in _escapes:
+ start = flush()
+ res.append(_escapes[c])
+ elif ord(c) < 0x20:
+ start = flush()
+ res.append('\\u%04x' % ord(c))
+ i += 1
+
+ flush()
+ return ''.join(res)
def _create_multiline_string_token(text):
diff --git a/pipenv/patched/prettytoml/tokens/toml2py.py b/pipenv/patched/prettytoml/tokens/toml2py.py
index 2bf9c1c..5680443 100644
--- a/pipenv/patched/prettytoml/tokens/toml2py.py
+++ b/pipenv/patched/prettytoml/tokens/toml2py.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
import re
import string
import iso8601
@@ -39,42 +40,40 @@ def _unescape_str(text):
"""
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
"""
-
- # Detect bad escape jobs
- bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
- if bad_escape_regexp.findall(text):
- raise BadEscapeCharacter
-
- # Do the unescaping
- if six.PY2:
- return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
- else:
- return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
-
-
-def _unicode_escaped_string(text):
- """
- Escapes all unicode characters in the given string
- """
-
- if six.PY2:
- text = unicode(text)
-
- def is_unicode(c):
- return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
-
- def escape_unicode_char(x):
- if six.PY2:
- return x.encode('unicode-escape')
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+ tokens = []
+ i = 0
+ basicstr_re = re.compile(r'[^"\\\000-\037]*')
+ unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
+ escapes = {
+ 'b': '\b',
+ 't': '\t',
+ 'n': '\n',
+ 'f': '\f',
+ 'r': '\r',
+ '\\': '\\',
+ '"': '"',
+ '/': '/',
+ "'": "'"
+ }
+ while True:
+ m = basicstr_re.match(text, i)
+ i = m.end()
+ tokens.append(m.group())
+ if i == len(text) or text[i] != '\\':
+ break
else:
- return codecs.encode(x, 'unicode-escape')
-
- if any(is_unicode(c) for c in text):
- homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
- homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
- return homogeneous_bytes.decode()
- else:
- return text
+ i += 1
+ if unicode_re.match(text, i):
+ m = unicode_re.match(text, i)
+ i = m.end()
+ tokens.append(six.unichr(int(m.group(1), 16)))
+ else:
+ if text[i] not in escapes:
+ raise BadEscapeCharacter
+ tokens.append(escapes[text[i]])
+ i += 1
+ return ''.join(tokens)
def _to_string(token):
+8
View File
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# We need to import the patched packages directly from sys.path, so the
# identity checks can pass.
import pipenv # noqa
@@ -8,6 +9,7 @@ import os
import pytest
import pytz
import contoml
from pipfile.api import PipfileParser
from prettytoml import lexer, tokens
from prettytoml.elements.atomic import AtomicElement
@@ -104,3 +106,9 @@ class TestPipfileParser:
def test_token_date(dt, content):
    """The primitive token built from ``dt`` must equal a TYPE_DATE token holding ``content``."""
    assert create_primitive_token(dt) == tokens.Token(tokens.TYPE_DATE, content)
def test_dump_nonascii_string():
    """Loading and re-dumping TOML that contains non-ASCII text must give back the same text."""
    source = 'name = "Stažené"\n'
    document = contoml.loads(source)
    assert contoml.dumps(document) == source