Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Fast data manipulations. 

4""" 

5import pandas 

6 

7 

def df2array(df, check=True):
    """
    Converts a dataframe into a :epkg:`numpy:array`
    without copying. :epkg:`pandas` is merging
    columns sharing the same type into one memory block.
    The function can be used only if the data is stored
    in one block and one type as a consequence.

    @param      df      dataframe
    @param      check   verifies the operation can be done (True)
                        or skip verification (False)
    @return             :epkg:`numpy:array`, the values of the unique
                        block, returned as is (no copy)

    The function raises *TypeError* if *check* is True and *df* is not
    a dataframe, *ValueError* if *check* is True and the dataframe
    does not hold exactly one block.

    .. note::
        :epkg:`pandas` stores a block with one row per column,
        the returned array is then laid out as *(columns, rows)*,
        which is the transpose of ``df.values``.

    See `data member <https://pandas.pydata.org/pandas-docs/stable/search.html?q=pointer&check_keywords=yes&area=default>`_,
    `_data <https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py#L322>`_.

    .. seealso:: @see fn df2arrays
    """
    if check and not isinstance(df, pandas.DataFrame):
        raise TypeError("df is not a pandas.DataFrame")  # pragma: no cover

    # ``_data`` is deprecated in recent versions of pandas in favour of
    # ``_mgr``; older versions only expose ``_data``.
    manager = getattr(df, "_mgr", None)
    if manager is None:
        manager = df._data
    blocks = manager.blocks

    if check and len(blocks) != 1:
        raise ValueError(
            "The dataframe has many block of data. There should be only one column type.")
    return blocks[0].values

34 

35 

def df2arrays(df, sep=",", check=True):
    """
    Converts a dataframe into a list of
    tuple *(column names, :epkg:`numpy:array`)*
    without copying. :epkg:`pandas` is merging
    columns sharing the same type
    into one memory block. That's what the function extracts.

    @param      df      dataframe
    @param      check   verifies the operation can be done (True)
                        or skip verification (False)
    @param      sep     columns separator
    @return             a list of tuple ``(column, array)``,
                        one per memory block, *column* is the
                        concatenation of the names of the columns
                        stored in the block joined with *sep*,
                        *array* holds the values of the block
                        (no copy)

    The function raises *TypeError* if *check* is True and *df* is not
    a dataframe.

    Example:

    .. runpython::
        :showcode:

        from pandas import DataFrame
        from cpyquickhelper.fastdata import df2arrays

        df = DataFrame([dict(a=3.4, b=5.6, c="e"),
                        dict(a=3.5, b=5.7, c="r")])
        arr = df2arrays(df)
        print(arr)


    .. seealso:: @see fn df2array
    """
    if check and not isinstance(df, pandas.DataFrame):
        raise TypeError("df is not a pandas.DataFrame")  # pragma: no cover

    # ``_data`` is deprecated in recent versions of pandas in favour of
    # ``_mgr``; older versions only expose ``_data``.
    manager = getattr(df, "_mgr", None)
    if manager is None:
        manager = df._data

    cols = df.columns
    res = []
    for block in manager.blocks:
        # A block is stored as (columns, rows) and the columns it holds
        # are not necessarily contiguous in the dataframe:
        # ``mgr_locs`` gives their positions, ``shape[1]`` is the
        # number of rows and cannot be used to pick the names.
        labels = cols[block.mgr_locs.indexer]
        # Labels are not always strings (integers for example).
        name = sep.join(str(label) for label in labels)
        res.append((name, block.values))
    return res

77 return res