This commit is contained in:
2017-01-27 00:24:59 -05:00
commit 170446e8c9
3 changed files with 115 additions and 0 deletions
+9
View File
@@ -0,0 +1,9 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
[dev-packages]
requests = "*"
[packages]
chardet = "*"
+39
View File
@@ -0,0 +1,39 @@
Ovaltine: a Secret Encoding Decoder Ring for Python
===================================================
**Ovaltine** (extracted from Requests) is a simple Python library for
working with unicode embedded within an undeterministic encoding.
For example, many web servers lie about what encoding their responses are.
You can use Ovaltine, which in turn uses chardet, to detect the apparent
encoding and get back as much usable data as possible.
Usage
-----
Simply get back unicode, no matter what::
>>> import ovaltine
>>> ovaltine.drink(b'foo bar')
u'foo bar'
Or, for more advanced usage::
>>> from ovaltine import DecoderRing
>>> content = requests.get('https://kennethreitz.org/').content
>>> r = DecoderRing(content)
>>> r
<DecoderRing len=74773, encoding=None, apparently='ISO-8859-2'>
>>> r.apparently
'ISO-8859-2'
>>> r.text
... # Unicode is shown here.
# Set the encoding yourself.
>>> r.encoding = 'UTF-8'
>>> r.text
... # Unicode is shown here.
+67
View File
@@ -0,0 +1,67 @@
import sys
import chardet
# Version Hacking
# ---------------
_ver = sys.version_info
is_py2 = (_ver[0] == 2)
is_py3 = (_ver[0] == 3)
if is_py2:
str = unicode
elif is_py3:
str = str
def drink(content, encoding=None):
"""Will attempt to turn any given bytes into unicode, and attempt
to use the given encoding.
"""
ring = DecoderRing(content)
ring.encoding = encoding
return ring.text
class DecoderRing(object):
"""A secret decoder ring, which decodes secret messages (e.g. bytes
of unknown, or simply undeterministic, encoding).
"""
def __init__(self, content):
super(DecoderRing, self).__init__()
self.content = content
self.encoding = None
def __repr__(self):
return '<DecoderRing len={0!r}, encoding={1!r}, apparently={2!r}>'.format(len(self.content), self.encoding, self.apparently)
@property
def apparently(self):
"""Returns the apparent encoding of the content."""
return chardet.detect(self.content)['encoding']
@property
def text(self):
"""Returns the unicode representation of the content."""
encoding = self.encoding
# Fallback to auto-detected encoding.
if self.encoding is None:
encoding = self.apparently
# Decode unicode from given encoding.
try:
content = str(self.content, encoding, errors='replace')
except (LookupError, TypeError):
# A LookupError is raised if the encoding was not found which could
# indicate a misspelling or similar mistake.
#
# A TypeError can be raised if encoding is None.
#
# So we try blindly encoding.
content = str(self.content, error='replace')
return content