mirror of
https://github.com/kennethreitz-archive/ovaltine.git
synced 2026-06-05 07:16:11 +00:00
init
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
[[source]]
|
||||
url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
|
||||
[dev-packages]
|
||||
requests = "*"
|
||||
|
||||
[packages]
|
||||
chardet = "*"
|
||||
+39
@@ -0,0 +1,39 @@
|
||||
Ovaltine: a Secret Encoding Decoder Ring for Python
|
||||
===================================================
|
||||
|
||||
**Ovaltine** (extracted from Requests) is a simple Python library for
|
||||
working with unicode embedded within an undetermined encoding.
|
||||
|
||||
For example, many web servers lie about what encoding their responses use.
|
||||
You can use Ovaltine, which in turn uses chardet, to detect the apparent
|
||||
encoding and get back as much usable data as possible.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
Simply get back unicode, no matter what::
|
||||
|
||||
>>> import ovaltine
|
||||
|
||||
>>> ovaltine.drink(b'foo bar')
|
||||
u'foo bar'
|
||||
|
||||
Or, for more advanced usage::
|
||||
|
||||
>>> from ovaltine import DecoderRing
|
||||
|
||||
>>> content = requests.get('https://kennethreitz.org/').content
|
||||
>>> r = DecoderRing(content)
|
||||
|
||||
>>> r
|
||||
<DecoderRing len=74773, encoding=None, apparently='ISO-8859-2'>
|
||||
>>> r.apparently
|
||||
'ISO-8859-2'
|
||||
>>> r.text
|
||||
... # Unicode is shown here.
|
||||
|
||||
# Set the encoding yourself.
|
||||
>>> r.encoding = 'UTF-8'
|
||||
>>> r.text
|
||||
... # Unicode is shown here.
|
||||
|
||||
+67
@@ -0,0 +1,67 @@
|
||||
import sys
|
||||
|
||||
import chardet
|
||||
|
||||
# Version Hacking
|
||||
# ---------------
|
||||
|
||||
# Snapshot of the running interpreter's version tuple.
_ver = sys.version_info

# Exactly one of these flags is true on Python 2.x / 3.x respectively.
is_py2, is_py3 = (_ver[0] == 2), (_ver[0] == 3)

# Expose a module-level ``str`` that always names the text (unicode) type.
if is_py3:
    str = str
elif is_py2:
    str = unicode
|
||||
|
||||
|
||||
def drink(content, encoding=None):
    """Decode ``content`` to unicode in a single call.

    Wraps ``content`` in a DecoderRing, assigns the requested ``encoding``
    to it (``None`` by default) and returns the ring's ``text``.
    """
    decoder = DecoderRing(content)
    decoder.encoding = encoding
    return decoder.text
|
||||
|
||||
|
||||
class DecoderRing(object):
    """A secret decoder ring, which decodes secret messages (e.g. bytes
    of unknown, or simply undetermined, encoding).
    """

    def __init__(self, content):
        """Store the raw ``content`` to decode; no encoding is set yet."""
        super(DecoderRing, self).__init__()
        # Raw payload that ``text`` will decode.
        self.content = content
        # Encoding name chosen by the caller; None means "use the detected one".
        self.encoding = None

    def __repr__(self):
        return '<DecoderRing len={0!r}, encoding={1!r}, apparently={2!r}>'.format(
            len(self.content), self.encoding, self.apparently)

    @property
    def apparently(self):
        """Returns the apparent encoding of the content, as reported by chardet."""
        return chardet.detect(self.content)['encoding']

    @property
    def text(self):
        """Returns the unicode representation of the content.

        Uses ``self.encoding`` when it is set, otherwise the detected
        encoding (``self.apparently``). If that encoding is unknown or
        unusable, falls back to decoding without an explicit encoding.
        Undecodable bytes are replaced rather than raising.
        """
        encoding = self.encoding

        # Fallback to auto-detected encoding.
        if encoding is None:
            encoding = self.apparently

        # Decode unicode from given encoding.
        try:
            return str(self.content, encoding, errors='replace')
        except (LookupError, TypeError):
            # A LookupError is raised if the encoding was not found which could
            # indicate a misspelling or similar mistake.
            #
            # A TypeError can be raised if encoding is None.
            #
            # So we try blindly decoding. The keyword must be ``errors``:
            # the previous ``error=`` spelling raised a TypeError here, so
            # the fallback itself could never succeed.
            return str(self.content, errors='replace')
|
||||
Reference in New Issue
Block a user