Source code for cpyquickhelper.fastdata.pandas2numpy
"""
Fast data manipulations.
:githublink:`%|py|5`
"""
import pandas
def df2array(df, check=True):
    """
    Converts a dataframe into a :epkg:`numpy:array`
    without copying. :epkg:`pandas` groups columns sharing
    the same type into one memory block. The function
    returns the values of the first block, so it can be used
    only if the data is stored in one block and one type
    as a consequence.

    :param df: dataframe
    :param check: verifies the operation can be done (True)
        or skip verification (False)
    :return: :epkg:`numpy:array`, the ``values`` attribute of the
        first internal block, returned as is (no copy is requested)
    :raises TypeError: if *check* is True and *df* is not
        a :epkg:`pandas` dataframe
    :raises ValueError: if *check* is True and the dataframe
        does not hold exactly one block

    The function relies on private members of :epkg:`pandas`.
    NOTE(review): for regular numpy blocks, pandas is expected to
    store the values as *(number of columns, number of rows)*,
    i.e. transposed compared to ``df.values`` -- confirm against
    the installed pandas version.

    See `data member <https://pandas.pydata.org/pandas-docs/stable/search.html?q=pointer&check_keywords=yes&area=default>`_,
    `_data <https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py#L322>`_.

    .. seealso:: :func:`df2arrays <cpyquickhelper.fastdata.pandas2numpy.df2arrays>`
    """
    if check and not isinstance(df, pandas.DataFrame):
        raise TypeError("df is not a pandas.DataFrame")  # pragma: no cover

    # The block manager is exposed as ``_mgr`` by recent pandas; ``_data``
    # is the older alias, kept as a fallback. Trying ``_mgr`` first avoids
    # the deprecation warning attached to ``_data``.
    manager = getattr(df, "_mgr", None)
    if manager is None:
        manager = df._data
    blocks = manager.blocks

    if check and len(blocks) != 1:
        raise ValueError(
            "The dataframe has many block of data. There should be only one column type.")
    return blocks[0].values
def df2arrays(df, sep=",", check=True):
    """
    Converts a dataframe into a list of tuple
    *(column names, :epkg:`numpy:array`)* without copying.
    :epkg:`pandas` groups columns sharing the same type
    into one memory block. That's what the function extracts:
    one tuple per internal block.

    :param df: dataframe
    :param sep: columns separator, the names of the columns
        stored in the same block are converted into strings
        and joined with it
    :param check: verifies *df* is a dataframe (True)
        or skip verification (False)
    :return: a list of tuple ``(column, array)``, *column* is
        the joined names of the columns held by the block,
        *array* is the ``values`` attribute of the block,
        returned as is (no copy is requested)
    :raises TypeError: if *check* is True and *df* is not
        a :epkg:`pandas` dataframe

    The names are taken from the positions every block declares
    (``mgr_locs``), so they stay right whatever the number of rows
    is and even if a block holds columns which are not contiguous.
    The function relies on private members of :epkg:`pandas`.
    NOTE(review): for regular numpy blocks, pandas is expected to
    store the values as *(number of columns, number of rows)* --
    confirm against the installed pandas version.

    Example:

    .. runpython::
        :showcode:

        from pandas import DataFrame
        from cpyquickhelper.fastdata import df2arrays

        df = DataFrame([dict(a=3.4, b=5.6, c="e"),
                        dict(a=3.5, b=5.7, c="r")])
        arr = df2arrays(df)
        print(arr)

    .. seealso:: :func:`df2array <cpyquickhelper.fastdata.pandas2numpy.df2array>`
    """
    if check and not isinstance(df, pandas.DataFrame):
        raise TypeError("df is not a pandas.DataFrame")  # pragma: no cover

    # The block manager is exposed as ``_mgr`` by recent pandas; ``_data``
    # is the older alias, kept as a fallback. Trying ``_mgr`` first avoids
    # the deprecation warning attached to ``_data``.
    manager = getattr(df, "_mgr", None)
    if manager is None:
        manager = df._data

    cols = df.columns
    res = []
    for block in manager.blocks:
        # ``mgr_locs`` gives the positions of the block's columns in the
        # dataframe. The block shape cannot be used to slice the names:
        # its second dimension is the number of rows, not of columns.
        labels = cols[block.mgr_locs.as_array]
        # ``str`` lets non-string labels (integers, tuples) be joined too.
        name = sep.join(map(str, labels))
        res.append((name, block.values))
    return res