# Source code for cpyquickhelper.fastdata.pandas2numpy

"""
Fast data manipulations.


:githublink:`%|py|5`
"""
import pandas


def df2array(df, check=True):
    """
    Converts a dataframe into a :epkg:`numpy:array` without copying.
    :epkg:`pandas` is merging consecutive columns sharing the same
    type into one memory block. The function can be used only if the
    data is stored in one block and one type as a consequence.

    :param df: dataframe
    :param check: verifies the operation can be done (True)
        or skip verification (False)
    :return: :epkg:`numpy:array`, the values held by the first
        (and, when *check* is True, only) block of the dataframe
    :raises TypeError: if *check* is True and *df* is not a dataframe
    :raises ValueError: if *check* is True and the dataframe does not
        hold exactly one block

    The function returns the internal buffer as it is. :epkg:`pandas`
    usually stores a block with one row per column, in which case the
    result is laid out like ``df.values.T`` (to be confirmed for the
    installed version of :epkg:`pandas`).

    See `data member <https://pandas.pydata.org/pandas-docs/stable/search.html?q=pointer&check_keywords=yes&area=default>`_,
    `_data <https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py#L322>`_.

    .. seealso:: :func:`df2arrays <cpyquickhelper.fastdata.pandas2numpy.df2arrays>`
    """
    if check and not isinstance(df, pandas.DataFrame):
        raise TypeError("df is not a pandas.DataFrame")  # pragma: no cover

    # ``_data`` is the legacy alias of ``_mgr`` and is deprecated in recent
    # versions of pandas; it is only used when ``_mgr`` is not available.
    manager = getattr(df, "_mgr", None)
    if manager is None:
        manager = df._data
    blocks = manager.blocks

    if check and len(blocks) != 1:
        raise ValueError(
            "The dataframe has many block of data. There should be only one column type.")
    return blocks[0].values
def df2arrays(df, sep=",", check=True):
    """
    Converts a dataframe into a list of tuple
    *(column names, :epkg:`numpy:array`)* without copying.
    :epkg:`pandas` is merging columns sharing the same type into one
    memory block. That's what the function extracts.

    :param df: dataframe
    :param check: verifies the operation can be done (True)
        or skip verification (False)
    :param sep: columns separator, used to join the names of the columns
        stored in the same block
    :return: a list of tuple ``(columns, array)``, one per block,
        in the order the blocks are stored
    :raises TypeError: if *check* is True and *df* is not a dataframe

    Column labels are converted into strings before being joined,
    every array is the internal buffer of a block, it is returned as is.

    Example:

    .. runpython::
        :showcode:

        from pandas import DataFrame
        from cpyquickhelper.fastdata import df2arrays

        df = DataFrame([dict(a=3.4, b=5.6, c="e"),
                        dict(a=3.5, b=5.7, c="r")])
        arr = df2arrays(df)
        print(arr)

    .. seealso:: :func:`df2array <cpyquickhelper.fastdata.pandas2numpy.df2array>`
    """
    if check and not isinstance(df, pandas.DataFrame):
        raise TypeError("df is not a pandas.DataFrame")  # pragma: no cover

    # ``_data`` is the legacy alias of ``_mgr`` and is deprecated in recent
    # versions of pandas; it is only used when ``_mgr`` is not available.
    manager = getattr(df, "_mgr", None)
    if manager is None:
        manager = df._data

    columns = df.columns
    res = []
    for block in manager.blocks:
        # Every block records the positions of the columns it stores.
        # Relying on it (instead of a running offset computed from the
        # block shape) keeps names and data aligned whatever the number
        # of rows is and even if the columns of a block are not contiguous.
        labels = columns[block.mgr_locs.as_array]
        name = sep.join(str(label) for label in labels)
        res.append((name, block.values))
    return res