Fixes #440 - Normalize stream inputs as IO streams

This commit is contained in:
Claude Paroz
2019-11-07 22:37:49 +01:00
parent 6152d995f0
commit 660990b6b0
13 changed files with 86 additions and 41 deletions
+3
View File
@@ -12,6 +12,9 @@
- Formats can now be dynamically registered through the
`tablib.formats.registry.register` API (#256).
- Tablib methods expecting data input (`detect_format`, `import_set`,
`Dataset.load`, `Databook.load`) now accept file-like objects in addition
to raw strings and bytestrings (#440).
### Bugfixes
+6 -3
View File
@@ -106,7 +106,8 @@ Importing Data
--------------
Creating a :class:`tablib.Dataset` object by importing a pre-existing file is simple. ::
imported_data = Dataset().load(open('data.csv').read())
with open('data.csv', 'r') as fh:
imported_data = Dataset().load(fh)
This detects what sort of data is being passed in, and uses an appropriate formatter to do the import. So you can import from a variety of different file types.
@@ -114,7 +115,8 @@ This detects what sort of data is being passed in, and uses an appropriate forma
When the format is :class:`csv <Dataset.csv>`, :class:`tsv <Dataset.tsv>`, :class:`dbf <Dataset.dbf>`, :class:`xls <Dataset.xls>` or :class:`xlsx <Dataset.xlsx>`, and the data source does not have headers, the import should be done as follows ::
imported_data = Dataset().load(open('data.csv').read(), headers=False)
with open('data.csv', 'r') as fh:
imported_data = Dataset().load(fh, headers=False)
--------------
Exporting Data
@@ -320,7 +322,8 @@ Open an Excel Workbook and read first sheet
Open an Excel 2007 and later workbook with a single sheet (or a workbook with multiple sheets but you just want the first sheet). ::
data = tablib.Dataset()
data.xlsx = open('my_excel_file.xlsx', 'rb').read()
with open('my_excel_file.xlsx', 'rb') as fh:
data.load(fh, 'xlsx')
print(data)
Excel Workbook With Multiple Sheets
+25 -12
View File
@@ -21,6 +21,7 @@ from tablib.exceptions import (
UnsupportedFormat,
)
from tablib.formats import registry
from tablib.utils import normalize_input
__title__ = 'tablib'
__author__ = 'Kenneth Reitz'
@@ -239,8 +240,9 @@ class Dataset:
def _get_in_format(self, fmt_key, **kwargs):
return registry.get_format(fmt_key).export_set(self, **kwargs)
def _set_in_format(self, fmt_key, *args, **kwargs):
return registry.get_format(fmt_key).import_set(self, *args, **kwargs)
def _set_in_format(self, fmt_key, in_stream, **kwargs):
in_stream = normalize_input(in_stream)
return registry.get_format(fmt_key).import_set(self, in_stream, **kwargs)
def _validate(self, row=None, col=None, safety=False):
"""Assures size of every row in dataset is of proper proportions."""
@@ -402,12 +404,14 @@ class Dataset:
def load(self, in_stream, format=None, **kwargs):
"""
Import `in_stream` to the :class:`Dataset` object using the `format`.
`in_stream` can be a file-like object, a string, or a bytestring.
:param \\*\\*kwargs: (optional) custom configuration to the format `import_set`.
"""
stream = normalize_input(in_stream)
if not format:
format = detect_format(in_stream)
format = detect_format(stream)
fmt = registry.get_format(format)
if not hasattr(fmt, 'import_set'):
@@ -416,7 +420,7 @@ class Dataset:
if not import_set:
raise UnsupportedFormat('Format {} cannot be imported.'.format(format))
fmt.import_set(self, in_stream, **kwargs)
fmt.import_set(self, stream, **kwargs)
return self
def export(self, format, **kwargs):
@@ -861,18 +865,20 @@ class Databook:
def load(self, in_stream, format, **kwargs):
"""
Import `in_stream` to the :class:`Databook` object using the `format`.
`in_stream` can be a file-like object, a string, or a bytestring.
:param \\*\\*kwargs: (optional) custom configuration to the format `import_book`.
"""
stream = normalize_input(in_stream)
if not format:
format = detect_format(in_stream)
format = detect_format(stream)
fmt = registry.get_format(format)
if not hasattr(fmt, 'import_book'):
raise UnsupportedFormat('Format {} cannot be loaded.'.format(format))
fmt.import_book(self, in_stream, **kwargs)
fmt.import_book(self, stream, **kwargs)
return self
def export(self, format, **kwargs):
@@ -889,25 +895,32 @@ class Databook:
def detect_format(stream):
"""Return format name of given stream."""
"""Return format name of given stream (file-like object, string, or bytestring)."""
stream = normalize_input(stream)
fmt_title = None
for fmt in registry.formats():
try:
if fmt.detect(stream):
return fmt.title
fmt_title = fmt.title
break
except AttributeError:
pass
finally:
if hasattr(stream, 'seek'):
stream.seek(0)
return fmt_title
def import_set(stream, format=None, **kwargs):
"""Return dataset of given stream."""
"""Return dataset of given stream (file-like object, string, or bytestring)."""
return Dataset().load(stream, format, **kwargs)
return Dataset().load(normalize_input(stream), format, **kwargs)
def import_book(stream, format=None, **kwargs):
"""Return dataset of given stream."""
"""Return databook of given stream (file-like object, string, or bytestring)."""
return Databook().load(stream, format, **kwargs)
return Databook().load(normalize_input(stream), format, **kwargs)
registry.register_builtins()
+3 -2
View File
@@ -6,6 +6,7 @@ from importlib import import_module
from importlib.util import find_spec
from tablib.exceptions import UnsupportedFormat
from tablib.utils import normalize_input
from ._csv import CSVFormat
from ._json import JSONFormat
@@ -52,7 +53,7 @@ class ImportExportBookDescriptor(FormatDescriptorBase):
def __set__(self, obj, val):
self.ensure_format_loaded()
return self._format.import_book(obj, val)
return self._format.import_book(obj, normalize_input(val))
class ImportExportSetDescriptor(FormatDescriptorBase):
@@ -62,7 +63,7 @@ class ImportExportSetDescriptor(FormatDescriptorBase):
def __set__(self, obj, val):
self.ensure_format_loaded()
return self._format.import_set(obj, val)
return self._format.import_set(obj, normalize_input(val))
class Registry:
+2 -2
View File
@@ -40,7 +40,7 @@ class CSVFormat:
kwargs.setdefault('delimiter', cls.DEFAULT_DELIMITER)
rows = csv.reader(StringIO(in_stream), **kwargs)
rows = csv.reader(in_stream, **kwargs)
for i, row in enumerate(rows):
if (i == 0) and (headers):
@@ -52,7 +52,7 @@ class CSVFormat:
def detect(cls, stream, delimiter=None):
"""Returns True if given stream is valid CSV."""
try:
csv.Sniffer().sniff(stream[:1024], delimiters=delimiter or cls.DEFAULT_DELIMITER)
csv.Sniffer().sniff(stream.read(1024), delimiters=delimiter or cls.DEFAULT_DELIMITER)
return True
except Exception:
return False
+2 -5
View File
@@ -50,7 +50,7 @@ class DBFFormat:
"""Returns a dataset from a DBF stream."""
dset.wipe()
_dbf = dbf.Dbf(io.BytesIO(in_stream))
_dbf = dbf.Dbf(in_stream)
dset.headers = _dbf.fieldNames
for record in range(_dbf.recordCount):
row = [_dbf[record][f] for f in _dbf.fieldNames]
@@ -59,11 +59,8 @@ class DBFFormat:
@classmethod
def detect(cls, stream):
"""Returns True if the given stream is valid DBF"""
#_dbf = dbf.Table(StringIO(stream))
try:
if type(stream) is not bytes:
stream = bytes(stream, 'utf-8')
_dbf = dbf.Dbf(io.BytesIO(stream), readOnly=True)
_dbf = dbf.Dbf(stream, readOnly=True)
return True
except Exception:
return False
+3 -1
View File
@@ -16,8 +16,10 @@ class DataFrameFormat:
"""Returns True if given stream is a DataFrame."""
if DataFrame is None:
return False
elif isinstance(stream, DataFrame):
return True
try:
DataFrame(stream)
DataFrame(stream.read())
return True
except ValueError:
return False
+3 -3
View File
@@ -35,14 +35,14 @@ class JSONFormat:
"""Returns dataset from JSON stream."""
dset.wipe()
dset.dict = json.loads(in_stream)
dset.dict = json.load(in_stream)
@classmethod
def import_book(cls, dbook, in_stream):
"""Returns databook from JSON stream."""
dbook.wipe()
for sheet in json.loads(in_stream):
for sheet in json.load(in_stream):
data = tablib.Dataset()
data.title = sheet['title']
data.dict = sheet['data']
@@ -52,7 +52,7 @@ class JSONFormat:
def detect(cls, stream):
"""Returns True if given stream is valid JSON."""
try:
json.loads(stream)
json.load(stream)
return True
except (TypeError, ValueError):
return False
+1 -1
View File
@@ -70,7 +70,7 @@ class XLSFormat:
dset.wipe()
xls_book = xlrd.open_workbook(file_contents=in_stream)
xls_book = xlrd.open_workbook(file_contents=in_stream.read())
sheet = xls_book.sheet_by_index(0)
dset.title = sheet.name
+2 -5
View File
@@ -18,9 +18,6 @@ class XLSXFormat:
@classmethod
def detect(cls, stream):
"""Returns True if given stream is a readable excel file."""
if isinstance(stream, bytes):
# load_workbook expects a file-like object.
stream = BytesIO(stream)
try:
openpyxl.reader.excel.load_workbook(stream, read_only=True)
return True
@@ -63,7 +60,7 @@ class XLSXFormat:
dset.wipe()
xls_book = openpyxl.reader.excel.load_workbook(BytesIO(in_stream), read_only=True)
xls_book = openpyxl.reader.excel.load_workbook(in_stream, read_only=True)
sheet = xls_book.active
dset.title = sheet.title
@@ -81,7 +78,7 @@ class XLSXFormat:
dbook.wipe()
xls_book = openpyxl.reader.excel.load_workbook(BytesIO(in_stream), read_only=True)
xls_book = openpyxl.reader.excel.load_workbook(in_stream, read_only=True)
for sheet in xls_book.worksheets:
data = tablib.Dataset()
+13
View File
@@ -0,0 +1,13 @@
from io import BytesIO, StringIO
def normalize_input(stream):
    """
    Return a file-like object for the given input.

    :param stream: a ``str``, a bytes-like object (``bytes``, ``bytearray``
        or ``memoryview``), or an already file-like object.
    :returns: a ``StringIO`` wrapping text input, a ``BytesIO`` wrapping
        bytes-like input, or ``stream`` itself, unchanged, for anything else.
    """
    if isinstance(stream, str):
        return StringIO(stream)
    # BytesIO accepts any bytes-like object, so wrap bytearray/memoryview too
    # instead of letting them fall through as if they were file-like.
    if isinstance(stream, (bytes, bytearray, memoryview)):
        return BytesIO(stream)
    # Anything else is assumed to already be file-like and is passed through.
    return stream
Binary file not shown.
+23 -7
View File
@@ -7,6 +7,7 @@ import json
import pickle
import unittest
from collections import OrderedDict
from io import BytesIO, StringIO
from pathlib import Path
from uuid import uuid4
@@ -302,6 +303,18 @@ class TablibTestCase(BaseTestCase):
with self.assertRaises(UnsupportedFormat):
book.export('csv')
def test_book_import_from_file(self):
xlsx_source = Path(__file__).parent / 'files' / 'founders.xlsx'
with open(str(xlsx_source), mode='rb') as fh:
book = tablib.Databook().load(fh, 'xlsx')
self.assertEqual(eval(book.json)[0]['title'], 'Feuille1')
def test_dataset_import_from_file(self):
xlsx_source = Path(__file__).parent / 'files' / 'founders.xlsx'
with open(str(xlsx_source), mode='rb') as fh:
dset = tablib.Dataset().load(fh, 'xlsx')
self.assertEqual(eval(dset.json)[0]['last_name'], 'Adams')
def test_auto_format_detect(self):
"""Test auto format detection."""
# html, jira, latex, rst are export only.
@@ -330,7 +343,9 @@ class TablibTestCase(BaseTestCase):
_tsv = '1\t2\t3\n4\t5\t6\n7\t8\t9\n'
self.assertEqual(tablib.detect_format(_tsv), 'tsv')
_bunk = '¡¡¡¡¡¡---///\n\n\n¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
_bunk = StringIO(
'¡¡¡¡¡¡---///\n\n\n¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
)
self.assertEqual(tablib.detect_format(_bunk), None)
def test_transpose(self):
@@ -692,12 +707,12 @@ class CSVTests(BaseTestCase):
def test_csv_format_detect(self):
"""Test CSV format detection."""
_csv = (
_csv = StringIO(
'1,2,3\n'
'4,5,6\n'
'7,8,9\n'
)
_bunk = (
_bunk = StringIO(
'¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
)
@@ -915,12 +930,12 @@ class TSVTests(BaseTestCase):
def test_tsv_format_detect(self):
"""Test TSV format detection."""
_tsv = (
_tsv = StringIO(
'1\t2\t3\n'
'4\t5\t6\n'
'7\t8\t9\n'
)
_bunk = (
_bunk = StringIO(
'¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
)
@@ -999,8 +1014,8 @@ class JSONTests(BaseTestCase):
def test_json_format_detect(self):
"""Test JSON format detection."""
_json = '[{"last_name": "Adams","age": 90,"first_name": "John"}]'
_bunk = (
_json = StringIO('[{"last_name": "Adams","age": 90,"first_name": "John"}]')
_bunk = StringIO(
'¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
)
@@ -1251,6 +1266,7 @@ class DBFTests(BaseTestCase):
_dbf += b' Jefferson' + (b' ' * 70)
_dbf += b' 50.0000000'
_dbf += b'\x1a'
_dbf = BytesIO(_dbf)
_yaml = '- {age: 90, first_name: John, last_name: Adams}'
_tsv = 'foo\tbar'