Source code for pyquickhelper.pandashelper.readh
# -*- coding:utf-8 -*-
"""
Various ways to import data into a dataframe
:githublink:`%|py|6`
"""
import zipfile
from io import StringIO, BytesIO
from ..filehelper import read_content_ufs
def read_csv(filepath_or_buffer, compression=None, fvalid=None, **params):
    """
    Reads a CSV file into a dataframe, it adds the support of
    zip archives on top of :epkg:`pandas:read_csv`.

    :param filepath_or_buffer: filepath or buffer
    :param compression: see :epkg:`pandas:read_csv`, the archive branch
        is taken when *filepath_or_buffer* is a string and *compression*
        is ``"zip"``, or *compression* is None and the name ends
        with ``.zip``
    :param fvalid: if the zip file contains many files, this function
        receives the name of every member, a member for which it
        returns a false value is not parsed and its raw content (bytes)
        is returned instead of a dataframe
    :param params: see :epkg:`pandas:read_csv`, parameter *encoding*
        is also used to decode the members of the archive
        (``utf-8`` if not specified)
    :return: a single result if the archive holds one member,
        otherwise a dictionary ``{ name: dataframe or bytes }``,
        the output of :epkg:`pandas:read_csv` if the input is not
        a zip archive
    :raises FileNotFoundError: if the archive does not contain any member
    :raises Exception: if pandas fails to parse one member, the message
        shows the first lines of the member and the original
        ``ParserError`` is chained to it

    See blog post :ref:`blogpost_read_csv`.
    """
    import pandas

    is_zip = isinstance(filepath_or_buffer, str) and (
        compression == "zip" or (
            compression is None and filepath_or_buffer.endswith(".zip")))

    if not is_zip:
        # Anything which is not a zip archive given by name is
        # left to pandas without any change.
        return pandas.read_csv(  # pragma: no cover
            filepath_or_buffer, compression=compression, **params)

    # utf-8 is the default encoding of pandas.read_csv and a superset of
    # ascii, ``or`` also covers an explicit ``encoding=None``.
    encoding = params.get('encoding') or 'utf-8'

    content = read_content_ufs(filepath_or_buffer, asbytes=True)
    res = {}
    with zipfile.ZipFile(BytesIO(content)) as myzip:
        infos = myzip.infolist()
        if not infos:
            raise FileNotFoundError(  # pragma: no cover
                "unable to find a file in " + filepath_or_buffer)

        for info in infos:
            name = info.filename
            with myzip.open(name, "r") as f:
                raw = f.read()

            if fvalid is not None and not fvalid(name):
                # Rejected members are returned as raw bytes.
                res[name] = raw
                continue

            text = raw.decode(encoding=encoding)
            try:
                # The member is already decompressed: forwarding
                # compression="zip" would make pandas try to unzip
                # plain text.
                df = pandas.read_csv(
                    StringIO(text), compression=None, **params)
            except pandas.errors.ParserError as e:  # pragma: no cover
                first_lines = "\n".join(text.split("\n")[:5])
                mes = "Parsing errors in '{0}', first lines:\n{1}".format(
                    name, first_lines)
                raise Exception(mes) from e
            res[name] = df

    # A single member is returned directly, not wrapped in a dictionary.
    return res if len(res) > 1 else list(res.values())[0]