Merge pull request #3176 from frostming/non-ascii-toml

Fix non-ASCII support for prettytoml
2026-06-05 22:50:18 +00:00 · 2018-11-07 17:17:17 -05:00
parent c55fed7845 fde06b3396
commit 6b13d5a68d
5 changed files with 201 additions and 40 deletions
@@ -0,0 +1 @@
+Handle non-ASCII characters correctly in TOML.
@@ -2,6 +2,7 @@
 """
 A converter of python values to TOML Token instances.
 """
+from __future__ import unicode_literals
 import codecs
 import datetime
 import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow


 def _escape_single_line_quoted_string(text):
-    if six.PY2:
-        return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
-    else:
-        return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+    start = 0
+    i = 0
+    res = []
+    _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+                '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+    def flush():
+        if start < i:
+            res.append(text[start:i])
+        return i + 1
+
+    while i < len(text):
+        c = text[i]
+        if c in _escapes:
+            start = flush()
+            res.append(_escapes[c])
+        elif ord(c) < 0x20:
+            start = flush()
+            res.append('\\u%04x' % ord(c))
+        i += 1
+
+    flush()
+    return ''.join(res)


 def _create_multiline_string_token(text):
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
 import re
 import string
 import iso8601
@@ -19,7 +20,7 @@ def deserialize(token):

    Raises DeserializationError when appropriate.
    """
-    
+
    if token.type == TYPE_BOOLEAN:
        return _to_boolean(token)
    elif token.type == TYPE_INTEGER:
@@ -39,42 +40,40 @@ def _unescape_str(text):
    """
    Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
    """
-
-    # Detect bad escape jobs
-    bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
-    if bad_escape_regexp.findall(text):
-        raise BadEscapeCharacter
-
-    # Do the unescaping
-    if six.PY2:
-        return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
-    else:
-        return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
-
-
-def _unicode_escaped_string(text):
-    """
-    Escapes all unicode characters in the given string
-    """
-
-    if six.PY2:
-        text = unicode(text)
-
-    def is_unicode(c):
-        return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
-
-    def escape_unicode_char(x):
-        if six.PY2:
-            return x.encode('unicode-escape')
+    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+    tokens = []
+    i = 0
+    basicstr_re = re.compile(r'[^"\\\000-\037]*')
+    unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
+    escapes = {
+        'b': '\b',
+        't': '\t',
+        'n': '\n',
+        'f': '\f',
+        'r': '\r',
+        '\\': '\\',
+        '"': '"',
+        '/': '/',
+        "'": "'"
+    }
+    while True:
+        m = basicstr_re.match(text, i)
+        i = m.end()
+        tokens.append(m.group())
+        if i == len(text) or text[i] != '\\':
+            break
        else:
-            return codecs.encode(x, 'unicode-escape')
-
-    if any(is_unicode(c) for c in text):
-        homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
-        homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
-        return homogeneous_bytes.decode()
-    else:
-        return text
+            i += 1
+        if unicode_re.match(text, i):
+            m = unicode_re.match(text, i)
+            i = m.end()
+            tokens.append(six.unichr(int(m.group(1), 16)))
+        else:
+            if text[i] not in escapes:
+                raise BadEscapeCharacter
+            tokens.append(escapes[text[i]])
+            i += 1
+    return ''.join(tokens)


 def _to_string(token):
@@ -0,0 +1,132 @@
+diff --git a/pipenv/patched/prettytoml/tokens/py2toml.py b/pipenv/patched/prettytoml/tokens/py2toml.py
+index 8299195..2decd02 100644
+--- a/pipenv/patched/prettytoml/tokens/py2toml.py
+++ b/pipenv/patched/prettytoml/tokens/py2toml.py
+@@ -2,6 +2,7 @@
+ """
+ A converter of python values to TOML Token instances.
+ """
+from __future__ import unicode_literals
+ import codecs
+ import datetime
+ import six
+@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow
+
+
+ def _escape_single_line_quoted_string(text):
+-    if six.PY2:
+-        return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
+-    else:
+-        return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+    start = 0
+    i = 0
+    res = []
+    _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+                '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+    def flush():
+        if start < i:
+            res.append(text[start:i])
+        return i + 1
+
+    while i < len(text):
+        c = text[i]
+        if c in _escapes:
+            start = flush()
+            res.append(_escapes[c])
+        elif ord(c) < 0x20:
+            start = flush()
+            res.append('\\u%04x' % ord(c))
+        i += 1
+
+    flush()
+    return ''.join(res)
+
+
+ def _create_multiline_string_token(text):
+diff --git a/pipenv/patched/prettytoml/tokens/toml2py.py b/pipenv/patched/prettytoml/tokens/toml2py.py
+index 2bf9c1c..5680443 100644
+--- a/pipenv/patched/prettytoml/tokens/toml2py.py
+++ b/pipenv/patched/prettytoml/tokens/toml2py.py
+@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
+ import re
+ import string
+ import iso8601
+@@ -39,42 +40,40 @@ def _unescape_str(text):
+     """
+     Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
+     """
+-
+-    # Detect bad escape jobs
+-    bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
+-    if bad_escape_regexp.findall(text):
+-        raise BadEscapeCharacter
+-
+-    # Do the unescaping
+-    if six.PY2:
+-        return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
+-    else:
+-        return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
+-
+-
+-def _unicode_escaped_string(text):
+-    """
+-    Escapes all unicode characters in the given string
+-    """
+-
+-    if six.PY2:
+-        text = unicode(text)
+-
+-    def is_unicode(c):
+-        return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
+-
+-    def escape_unicode_char(x):
+-        if six.PY2:
+-            return x.encode('unicode-escape')
+    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+    tokens = []
+    i = 0
+    basicstr_re = re.compile(r'[^"\\\000-\037]*')
+    unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
+    escapes = {
+        'b': '\b',
+        't': '\t',
+        'n': '\n',
+        'f': '\f',
+        'r': '\r',
+        '\\': '\\',
+        '"': '"',
+        '/': '/',
+        "'": "'"
+    }
+    while True:
+        m = basicstr_re.match(text, i)
+        i = m.end()
+        tokens.append(m.group())
+        if i == len(text) or text[i] != '\\':
+            break
+         else:
+-            return codecs.encode(x, 'unicode-escape')
+-
+-    if any(is_unicode(c) for c in text):
+-        homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
+-        homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
+-        return homogeneous_bytes.decode()
+-    else:
+-        return text
+            i += 1
+        if unicode_re.match(text, i):
+            m = unicode_re.match(text, i)
+            i = m.end()
+            tokens.append(six.unichr(int(m.group(1), 16)))
+        else:
+            if text[i] not in escapes:
+                raise BadEscapeCharacter
+            tokens.append(escapes[text[i]])
+            i += 1
+    return ''.join(tokens)
+
+
+ def _to_string(token):
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # We need to import the patched packages directly from sys.path, so the
 # identity checks can pass.
 import pipenv  # noqa
@@ -8,6 +9,7 @@ import os
 import pytest
 import pytz

+import contoml
 from pipfile.api import PipfileParser
 from prettytoml import lexer, tokens
 from prettytoml.elements.atomic import AtomicElement
@@ -104,3 +106,9 @@ class TestPipfileParser:
 def test_token_date(dt, content):
    token = create_primitive_token(dt)
    assert token == tokens.Token(tokens.TYPE_DATE, content)
+
+
+def test_dump_nonascii_string():
+    content = 'name = "Stažené"\n'
+    toml_content = contoml.dumps(contoml.loads(content))
+    assert toml_content == content
@@ -0,0 +1 @@
+Handle non-ASCII characters correctly in TOML.