From 170446e8c93bce614721e2abc6731663164e9902 Mon Sep 17 00:00:00 2001 From: Kenneth Reitz Date: Fri, 27 Jan 2017 00:24:59 -0500 Subject: [PATCH] init --- Pipfile | 9 +++++++ README.rst | 39 +++++++++++++++++++++++++++++++ ovaltine.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+) create mode 100644 Pipfile create mode 100644 README.rst create mode 100644 ovaltine.py diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..edddb49 --- /dev/null +++ b/Pipfile @@ -0,0 +1,9 @@ +[[source]] +url = "https://pypi.python.org/simple" +verify_ssl = true + +[dev-packages] +requests = "*" + +[packages] +chardet = "*" diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..75d2989 --- /dev/null +++ b/README.rst @@ -0,0 +1,39 @@ +Ovaltine: a Secret Encoding Decoder Ring for Python +=================================================== + +**Ovaltine** (extracted from Requests) is a simple Python library for +working with Unicode text stored in bytes of an unknown or unreliable encoding. + +For example, many web servers lie about which encoding their responses use. +You can use Ovaltine, which in turn uses chardet, to detect the apparent +encoding and get back as much usable data as possible. + +Usage +----- + +Simply get back unicode, no matter what:: + + >>> import ovaltine + + >>> ovaltine.drink(b'foo bar') + u'foo bar' + +Or, for more advanced usage:: + + >>> from ovaltine import DecoderRing + + >>> content = requests.get('https://kennethreitz.org/').content + >>> r = DecoderRing(content) + + >>> r + + >>> r.apparently + 'ISO-8859-2' + >>> r.text + ... # Unicode is shown here. + + # Set the encoding yourself. + >>> r.encoding = 'UTF-8' + >>> r.text + ... # Unicode is shown here.
# diff --git a/ovaltine.py b/ovaltine.py (new file, mode 100644, index 0000000..6faa808)
"""Ovaltine: decode bytes of unknown or unreliable encoding into unicode."""

import sys

try:
    import chardet
except ImportError:
    # Detection is only needed when no explicit encoding is supplied, so a
    # missing chardet must not prevent decoding with a caller-given encoding.
    chardet = None

# Version Hacking
# ---------------

_ver = sys.version_info
is_py2 = (_ver[0] == 2)
is_py3 = (_ver[0] == 3)

# Throughout this module ``str`` means "the unicode text type".
if is_py2:
    str = unicode  # noqa: F821 -- this name only exists on Python 2
elif is_py3:
    str = str


def drink(content, encoding=None):
    """Turn the given bytes into unicode, trying ``encoding`` first.

    :param content: the bytes to decode (unicode is passed through as-is).
    :param encoding: optional encoding name; when ``None`` the encoding
        detected by chardet is used instead.
    :returns: the decoded unicode text, with undecodable bytes replaced.
    :raises ImportError: if ``encoding`` is ``None`` and chardet is missing.
    """
    ring = DecoderRing(content)
    ring.encoding = encoding
    return ring.text


class DecoderRing(object):
    """A secret decoder ring, which decodes secret messages (e.g. bytes
    of unknown, or simply non-deterministic, encoding).

    Attributes:
        content: the raw payload handed to the constructor.
        encoding: encoding name to decode with; ``None`` (the default)
            means "use whatever ``apparently`` detects".
    """

    def __init__(self, content):
        super(DecoderRing, self).__init__()
        self.content = content
        self.encoding = None

    def __repr__(self):
        # Note: evaluating ``apparently`` runs chardet detection.
        return '<DecoderRing content-length={0} encoding={1!r} apparently={2!r}>'.format(
            len(self.content), self.encoding, self.apparently)

    @property
    def apparently(self):
        """Return the apparent encoding of the content, as reported by
        ``chardet.detect`` (its ``'encoding'`` entry).

        :raises ImportError: if chardet could not be imported.
        """
        if chardet is None:
            raise ImportError(
                'chardet is required to detect the apparent encoding')
        return chardet.detect(self.content)['encoding']

    @property
    def text(self):
        """Return the unicode representation of the content.

        The explicit ``encoding`` is used when set, otherwise the detected
        one. If that encoding is unknown or unusable, the content is decoded
        with the interpreter's default encoding. Undecodable bytes are
        replaced rather than raising.
        """
        # Already unicode: there is nothing to decode.
        if isinstance(self.content, str):
            return self.content

        encoding = self.encoding

        # Fall back to the auto-detected encoding.
        if encoding is None:
            encoding = self.apparently

        try:
            return str(self.content, encoding, errors='replace')
        except (LookupError, TypeError):
            # A LookupError is raised if the encoding was not found, which
            # could indicate a misspelling or similar mistake.
            #
            # A TypeError is raised if the encoding is None (e.g. detection
            # found nothing).
            #
            # So we decode blindly with the default encoding.
            return str(self.content, errors='replace')