Coverage for pyquickhelper/pandashelper/readh.py: 100%

25 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-03 02:21 +0200

1# -*- coding:utf-8 -*- 

2""" 

3@file 

4@brief Various ways to import data into a dataframe 

5""" 

6import zipfile 

7from io import StringIO, BytesIO 

8from ..filehelper import read_content_ufs 

9 

10 

def read_csv(filepath_or_buffer, compression=None, fvalid=None, **params):
    """
    Reads a CSV file into a dataframe, it adds the support of zip
    archives given by name on top of :epkg:`pandas:read_csv`.

    The zip branch is taken when *filepath_or_buffer* is a string and
    either ``compression == "zip"`` or *compression* is None and the name
    ends with ``.zip``. The archive is loaded with ``read_content_ufs``
    and every member is processed. In every other case, the call is
    forwarded unchanged to :epkg:`pandas:read_csv`.

    @param      filepath_or_buffer      filepath or buffer
    @param      compression             see :epkg:`pandas:read_csv`
    @param      params                  see :epkg:`pandas:read_csv`,
                                        ``encoding`` is also used to decode
                                        the members of a zip archive
                                        (default is ``'ascii'``)
    @param      fvalid                  if the zip file contains many files,
                                        this function receives the name of
                                        each member, a member it rejects
                                        (falsy result) is not parsed and its
                                        raw content (bytes) is returned
    @return                             dataframe, or a dictionary
                                        ``{ name: dataframe or bytes }`` if
                                        the archive holds more than one member

    Raises ``FileNotFoundError`` if the archive is empty and
    ``RuntimeError`` if pandas fails to parse one member.

    See blog post :ref:`blogpost_read_csv`.
    """
    import pandas

    is_zip = isinstance(filepath_or_buffer, str) and (
        compression == "zip" or (
            compression is None and filepath_or_buffer.endswith(".zip")))
    if not is_zip:
        return pandas.read_csv(  # pragma: no cover
            filepath_or_buffer, compression=compression, **params)

    # pandas accepts encoding=None but bytes.decode does not,
    # None falls back to the historical default of this function.
    encoding = params.get('encoding') or 'ascii'

    content = read_content_ufs(filepath_or_buffer, asbytes=True)
    res = {}
    with zipfile.ZipFile(BytesIO(content)) as myzip:
        infos = myzip.infolist()
        if not infos:
            raise FileNotFoundError(  # pragma: no cover
                "unable to find a file in " + filepath_or_buffer)
        for info in infos:
            name = info.filename
            data = myzip.read(name)
            if fvalid is not None and not fvalid(name):
                # Rejected members are returned as raw bytes.
                res[name] = data
                continue
            text = data.decode(encoding=encoding)
            try:
                # The text is already uncompressed: the compression
                # requested by the caller must not be applied again.
                df = pandas.read_csv(
                    StringIO(text), compression=None, **params)
            except pandas.errors.ParserError as e:  # pragma: no cover
                first_lines = "\n".join(text.split("\n")[:5])
                raise RuntimeError(
                    f"Parsing errors in '{name}', "
                    f"first lines:\n{first_lines}") from e
            res[name] = df

    # A single member is returned directly, not wrapped in a dictionary.
    return res if len(res) > 1 else next(iter(res.values()))

64 filepath_or_buffer, compression=compression, **params)