Coverage for pyquickhelper/pandashelper/readh.py: 100%
25 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
1# -*- coding:utf-8 -*-
2"""
3@file
4@brief Various ways to import data into a dataframe
5"""
6import zipfile
7from io import StringIO, BytesIO
8from ..filehelper import read_content_ufs
def read_csv(filepath_or_buffer, compression=None, fvalid=None, **params):
    """
    Reads a CSV file into a dataframe, it handles zip archives itself
    (one or several files in the archive) and otherwise defers to
    :epkg:`pandas:read_csv`.

    @param      filepath_or_buffer      filepath or buffer
    @param      compression             see :epkg:`pandas:read_csv`,
                                        the archive logic is used when
                                        *filepath_or_buffer* is a string and
                                        *compression* is ``'zip'`` or
                                        *compression* is None and the name
                                        ends with ``.zip``
    @param      fvalid                  if the zip file contains many files,
                                        this function tells, based on the name,
                                        which ones must be parsed,
                                        a file it rejects (returns False)
                                        is returned as raw content (bytes)
    @param      params                  see :epkg:`pandas:read_csv`,
                                        *encoding* is also used to decode
                                        the files of the archive
                                        (default: ``'ascii'``)
    @return                             dataframe, or a dictionary
                                        ``{ name: dataframe or bytes }`` if the
                                        archive contains more than one file

    The function raises *FileNotFoundError* if the archive is empty,
    *RuntimeError* if :epkg:`pandas` fails to parse one file of the archive
    (the message shows the first lines of that file).

    See blog post :ref:`blogpost_read_csv`.
    """
    import pandas

    use_zip = isinstance(filepath_or_buffer, str) and (
        compression == "zip" or
        (compression is None and filepath_or_buffer.endswith(".zip")))

    if not use_zip:
        # Anything else is handled by pandas directly.
        return pandas.read_csv(  # pragma: no cover
            filepath_or_buffer, compression=compression, **params)

    # The archive is fetched as bytes by the project helper and opened
    # from memory.
    content = read_content_ufs(filepath_or_buffer, asbytes=True)

    # pandas accepts encoding=None but bytes.decode does not,
    # None is replaced by the default encoding.
    encoding = params.get('encoding') or 'ascii'

    res = {}
    with zipfile.ZipFile(BytesIO(content)) as myzip:
        infos = myzip.infolist()
        if not infos:
            raise FileNotFoundError(  # pragma: no cover
                "unable to find a file in " + filepath_or_buffer)

        for info in infos:
            name = info.filename
            with myzip.open(name, "r") as f:
                raw = f.read()

            if fvalid is not None and not fvalid(name):
                # Rejected files are returned as raw bytes, not parsed.
                res[name] = raw
                continue

            text = raw.decode(encoding=encoding)
            try:
                # The content is already extracted: compression must not be
                # forwarded, pandas would try to unzip plain text.
                df = pandas.read_csv(
                    StringIO(text), compression=None, **params)
            except pandas.errors.ParserError as e:  # pragma: no cover
                first_lines = text.split("\n")[:5]
                mes = "Parsing errors in '{0}', first lines:\n{1}".format(
                    name, "\n".join(first_lines))
                raise RuntimeError(mes) from e
            res[name] = df

    # A single file is returned on its own, several ones as a dictionary.
    return res if len(res) > 1 else next(iter(res.values()))