diff --git a/HISTORY.md b/HISTORY.md index f3a1a34..945608c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -12,6 +12,9 @@ - Formats can now be dynamically registered through the `tablib.formats.registry.refister` API (#256). +- Tablib methods expecting data input (`detect_format`, `import_set`, + `Dataset.load`, `Databook.load`) now accept file-like objects in addition + to raw strings and bytestrings (#440). ### Bugfixes diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 6226aef..23f4828 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -106,7 +106,8 @@ Importing Data -------------- Creating a :class:`tablib.Dataset` object by importing a pre-existing file is simple. :: - imported_data = Dataset().load(open('data.csv').read()) + with open('data.csv', 'r') as fh: + imported_data = Dataset().load(fh) This detects what sort of data is being passed in, and uses an appropriate formatter to do the import. So you can import from a variety of different file types. @@ -114,7 +115,8 @@ This detects what sort of data is being passed in, and uses an appropriate forma When the format is :class:`csv `, :class:`tsv `, :class:`dbf `, :class:`xls ` or :class:`xlsx `, and the data source does not have headers, the import should be done as follows :: - imported_data = Dataset().load(open('data.csv').read(), headers=False) + with open('data.csv', 'r') as fh: + imported_data = Dataset().load(fh, headers=False) -------------- Exporting Data @@ -320,7 +322,8 @@ Open an Excel Workbook and read first sheet Open an Excel 2007 and later workbook with a single sheet (or a workbook with multiple sheets but you just want the first sheet). 
:: data = tablib.Dataset() - data.xlsx = open('my_excel_file.xlsx', 'rb').read() + with open('my_excel_file.xlsx', 'rb') as fh: + data.load(fh, 'xlsx') print(data) Excel Workbook With Multiple Sheets diff --git a/src/tablib/core.py b/src/tablib/core.py index 9eaeb61..ef3c26e 100644 --- a/src/tablib/core.py +++ b/src/tablib/core.py @@ -21,6 +21,7 @@ from tablib.exceptions import ( UnsupportedFormat, ) from tablib.formats import registry +from tablib.utils import normalize_input __title__ = 'tablib' __author__ = 'Kenneth Reitz' @@ -239,8 +240,9 @@ class Dataset: def _get_in_format(self, fmt_key, **kwargs): return registry.get_format(fmt_key).export_set(self, **kwargs) - def _set_in_format(self, fmt_key, *args, **kwargs): - return registry.get_format(fmt_key).import_set(self, *args, **kwargs) + def _set_in_format(self, fmt_key, in_stream, **kwargs): + in_stream = normalize_input(in_stream) + return registry.get_format(fmt_key).import_set(self, in_stream, **kwargs) def _validate(self, row=None, col=None, safety=False): """Assures size of every row in dataset is of proper proportions.""" @@ -402,12 +404,14 @@ class Dataset: def load(self, in_stream, format=None, **kwargs): """ Import `in_stream` to the :class:`Dataset` object using the `format`. + `in_stream` can be a file-like object, a string, or a bytestring. :param \\*\\*kwargs: (optional) custom configuration to the format `import_set`. 
""" + stream = normalize_input(in_stream) if not format: - format = detect_format(in_stream) + format = detect_format(stream) fmt = registry.get_format(format) if not hasattr(fmt, 'import_set'): @@ -416,7 +420,7 @@ class Dataset: if not import_set: raise UnsupportedFormat('Format {} cannot be imported.'.format(format)) - fmt.import_set(self, in_stream, **kwargs) + fmt.import_set(self, stream, **kwargs) return self def export(self, format, **kwargs): @@ -861,18 +865,20 @@ class Databook: def load(self, in_stream, format, **kwargs): """ Import `in_stream` to the :class:`Databook` object using the `format`. + `in_stream` can be a file-like object, a string, or a bytestring. :param \\*\\*kwargs: (optional) custom configuration to the format `import_book`. """ + stream = normalize_input(in_stream) if not format: - format = detect_format(in_stream) + format = detect_format(stream) fmt = registry.get_format(format) if not hasattr(fmt, 'import_book'): raise UnsupportedFormat('Format {} cannot be loaded.'.format(format)) - fmt.import_book(self, in_stream, **kwargs) + fmt.import_book(self, stream, **kwargs) return self def export(self, format, **kwargs): @@ -889,25 +895,32 @@ class Databook: def detect_format(stream): - """Return format name of given stream.""" + """Return format name of given stream (file-like object, string, or bytestring).""" + stream = normalize_input(stream) + fmt_title = None for fmt in registry.formats(): try: if fmt.detect(stream): - return fmt.title + fmt_title = fmt.title + break except AttributeError: pass + finally: + if hasattr(stream, 'seek'): + stream.seek(0) + return fmt_title def import_set(stream, format=None, **kwargs): - """Return dataset of given stream.""" + """Return dataset of given stream (file-like object, string, or bytestring).""" - return Dataset().load(stream, format, **kwargs) + return Dataset().load(normalize_input(stream), format, **kwargs) def import_book(stream, format=None, **kwargs): - """Return dataset of given 
stream.""" + """Return databook of given stream (file-like object, string, or bytestring).""" - return Databook().load(stream, format, **kwargs) + return Databook().load(normalize_input(stream), format, **kwargs) registry.register_builtins() diff --git a/src/tablib/formats/__init__.py b/src/tablib/formats/__init__.py index 1b5b0d6..848e665 100644 --- a/src/tablib/formats/__init__.py +++ b/src/tablib/formats/__init__.py @@ -6,6 +6,7 @@ from importlib import import_module from importlib.util import find_spec from tablib.exceptions import UnsupportedFormat +from tablib.utils import normalize_input from ._csv import CSVFormat from ._json import JSONFormat @@ -52,7 +53,7 @@ class ImportExportBookDescriptor(FormatDescriptorBase): def __set__(self, obj, val): self.ensure_format_loaded() - return self._format.import_book(obj, val) + return self._format.import_book(obj, normalize_input(val)) class ImportExportSetDescriptor(FormatDescriptorBase): @@ -62,7 +63,7 @@ class ImportExportSetDescriptor(FormatDescriptorBase): def __set__(self, obj, val): self.ensure_format_loaded() - return self._format.import_set(obj, val) + return self._format.import_set(obj, normalize_input(val)) class Registry: diff --git a/src/tablib/formats/_csv.py b/src/tablib/formats/_csv.py index cb209fc..14d7bb2 100644 --- a/src/tablib/formats/_csv.py +++ b/src/tablib/formats/_csv.py @@ -40,7 +40,7 @@ class CSVFormat: kwargs.setdefault('delimiter', cls.DEFAULT_DELIMITER) - rows = csv.reader(StringIO(in_stream), **kwargs) + rows = csv.reader(in_stream, **kwargs) for i, row in enumerate(rows): if (i == 0) and (headers): @@ -52,7 +52,7 @@ class CSVFormat: def detect(cls, stream, delimiter=None): """Returns True if given stream is valid CSV.""" try: - csv.Sniffer().sniff(stream[:1024], delimiters=delimiter or cls.DEFAULT_DELIMITER) + csv.Sniffer().sniff(stream.read(1024), delimiters=delimiter or cls.DEFAULT_DELIMITER) return True except Exception: return False diff --git a/src/tablib/formats/_dbf.py 
b/src/tablib/formats/_dbf.py index 4ff1604..7898cbd 100644 --- a/src/tablib/formats/_dbf.py +++ b/src/tablib/formats/_dbf.py @@ -50,7 +50,7 @@ class DBFFormat: """Returns a dataset from a DBF stream.""" dset.wipe() - _dbf = dbf.Dbf(io.BytesIO(in_stream)) + _dbf = dbf.Dbf(in_stream) dset.headers = _dbf.fieldNames for record in range(_dbf.recordCount): row = [_dbf[record][f] for f in _dbf.fieldNames] @@ -59,11 +59,8 @@ class DBFFormat: @classmethod def detect(cls, stream): """Returns True if the given stream is valid DBF""" - #_dbf = dbf.Table(StringIO(stream)) try: - if type(stream) is not bytes: - stream = bytes(stream, 'utf-8') - _dbf = dbf.Dbf(io.BytesIO(stream), readOnly=True) + _dbf = dbf.Dbf(stream, readOnly=True) return True except Exception: return False diff --git a/src/tablib/formats/_df.py b/src/tablib/formats/_df.py index b4cfa11..d8bf877 100644 --- a/src/tablib/formats/_df.py +++ b/src/tablib/formats/_df.py @@ -16,8 +16,10 @@ class DataFrameFormat: """Returns True if given stream is a DataFrame.""" if DataFrame is None: return False + elif isinstance(stream, DataFrame): + return True try: - DataFrame(stream) + DataFrame(stream.read()) return True except ValueError: return False diff --git a/src/tablib/formats/_json.py b/src/tablib/formats/_json.py index 99e2aaf..dd8c379 100644 --- a/src/tablib/formats/_json.py +++ b/src/tablib/formats/_json.py @@ -35,14 +35,14 @@ class JSONFormat: """Returns dataset from JSON stream.""" dset.wipe() - dset.dict = json.loads(in_stream) + dset.dict = json.load(in_stream) @classmethod def import_book(cls, dbook, in_stream): """Returns databook from JSON stream.""" dbook.wipe() - for sheet in json.loads(in_stream): + for sheet in json.load(in_stream): data = tablib.Dataset() data.title = sheet['title'] data.dict = sheet['data'] @@ -52,7 +52,7 @@ class JSONFormat: def detect(cls, stream): """Returns True if given stream is valid JSON.""" try: - json.loads(stream) + json.load(stream) return True except (TypeError, ValueError): 
return False diff --git a/src/tablib/formats/_xls.py b/src/tablib/formats/_xls.py index fd39b46..0b13d27 100644 --- a/src/tablib/formats/_xls.py +++ b/src/tablib/formats/_xls.py @@ -70,7 +70,7 @@ class XLSFormat: dset.wipe() - xls_book = xlrd.open_workbook(file_contents=in_stream) + xls_book = xlrd.open_workbook(file_contents=in_stream.read()) sheet = xls_book.sheet_by_index(0) dset.title = sheet.name diff --git a/src/tablib/formats/_xlsx.py b/src/tablib/formats/_xlsx.py index cc0a610..d7416b3 100644 --- a/src/tablib/formats/_xlsx.py +++ b/src/tablib/formats/_xlsx.py @@ -18,9 +18,6 @@ class XLSXFormat: @classmethod def detect(cls, stream): """Returns True if given stream is a readable excel file.""" - if isinstance(stream, bytes): - # load_workbook expects a file-like object. - stream = BytesIO(stream) try: openpyxl.reader.excel.load_workbook(stream, read_only=True) return True @@ -63,7 +60,7 @@ class XLSXFormat: dset.wipe() - xls_book = openpyxl.reader.excel.load_workbook(BytesIO(in_stream), read_only=True) + xls_book = openpyxl.reader.excel.load_workbook(in_stream, read_only=True) sheet = xls_book.active dset.title = sheet.title @@ -81,7 +78,7 @@ class XLSXFormat: dbook.wipe() - xls_book = openpyxl.reader.excel.load_workbook(BytesIO(in_stream), read_only=True) + xls_book = openpyxl.reader.excel.load_workbook(in_stream, read_only=True) for sheet in xls_book.worksheets: data = tablib.Dataset() diff --git a/src/tablib/utils.py b/src/tablib/utils.py new file mode 100644 index 0000000..39de8ce --- /dev/null +++ b/src/tablib/utils.py @@ -0,0 +1,13 @@ +from io import BytesIO, StringIO + + +def normalize_input(stream): + """ + Accept either a str/bytes stream or a file-like object and always return a + file-like object. 
+ """ + if isinstance(stream, str): + return StringIO(stream) + elif isinstance(stream, bytes): + return BytesIO(stream) + return stream diff --git a/tests/files/founders.xlsx b/tests/files/founders.xlsx new file mode 100644 index 0000000..bd6e41e Binary files /dev/null and b/tests/files/founders.xlsx differ diff --git a/tests/test_tablib.py b/tests/test_tablib.py index 91df57f..e71105e 100755 --- a/tests/test_tablib.py +++ b/tests/test_tablib.py @@ -7,6 +7,7 @@ import json import pickle import unittest from collections import OrderedDict +from io import BytesIO, StringIO from pathlib import Path from uuid import uuid4 @@ -302,6 +303,18 @@ class TablibTestCase(BaseTestCase): with self.assertRaises(UnsupportedFormat): book.export('csv') + def test_book_import_from_file(self): + xlsx_source = Path(__file__).parent / 'files' / 'founders.xlsx' + with open(str(xlsx_source), mode='rb') as fh: + book = tablib.Databook().load(fh, 'xlsx') + self.assertEqual(eval(book.json)[0]['title'], 'Feuille1') + + def test_dataset_import_from_file(self): + xlsx_source = Path(__file__).parent / 'files' / 'founders.xlsx' + with open(str(xlsx_source), mode='rb') as fh: + dset = tablib.Dataset().load(fh, 'xlsx') + self.assertEqual(eval(dset.json)[0]['last_name'], 'Adams') + def test_auto_format_detect(self): """Test auto format detection.""" # html, jira, latex, rst are export only. 
@@ -330,7 +343,9 @@ class TablibTestCase(BaseTestCase): _tsv = '1\t2\t3\n4\t5\t6\n7\t8\t9\n' self.assertEqual(tablib.detect_format(_tsv), 'tsv') - _bunk = '¡¡¡¡¡¡---///\n\n\n¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶' + _bunk = StringIO( + '¡¡¡¡¡¡---///\n\n\n¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶' + ) self.assertEqual(tablib.detect_format(_bunk), None) def test_transpose(self): @@ -692,12 +707,12 @@ class CSVTests(BaseTestCase): def test_csv_format_detect(self): """Test CSV format detection.""" - _csv = ( + _csv = StringIO( '1,2,3\n' '4,5,6\n' '7,8,9\n' ) - _bunk = ( + _bunk = StringIO( '¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶' ) @@ -915,12 +930,12 @@ class TSVTests(BaseTestCase): def test_tsv_format_detect(self): """Test TSV format detection.""" - _tsv = ( + _tsv = StringIO( '1\t2\t3\n' '4\t5\t6\n' '7\t8\t9\n' ) - _bunk = ( + _bunk = StringIO( '¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶' ) @@ -999,8 +1014,8 @@ class JSONTests(BaseTestCase): def test_json_format_detect(self): """Test JSON format detection.""" - _json = '[{"last_name": "Adams","age": 90,"first_name": "John"}]' - _bunk = ( + _json = StringIO('[{"last_name": "Adams","age": 90,"first_name": "John"}]') + _bunk = StringIO( '¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶' ) @@ -1251,6 +1266,7 @@ class DBFTests(BaseTestCase): _dbf += b' Jefferson' + (b' ' * 70) _dbf += b' 50.0000000' _dbf += b'\x1a' + _dbf = BytesIO(_dbf) _yaml = '- {age: 90, first_name: John, last_name: Adams}' _tsv = 'foo\tbar'