# Source code for pyensae.mlhelper.table_formula

# -*- coding: utf-8 -*-
"""
Adds functionalities to a dataframe.


:githublink:`%|py|6`
"""
import datetime
import pandas


class TableFormula(pandas.DataFrame):  # pylint: disable=W0223
    """
    Extends class :epkg:`pandas:DataFrame` or proposes extensions
    to existing functions using lambda functions.
    See `Extending Pandas <https://pandas.pydata.org/pandas-docs/
    stable/development/extending.html>`_.
    """

    @property
    def _constructor(self):
        """
        Returns the class used to build new dataframes
        (see *Extending Pandas*), here :class:`TableFormula`.
        """
        return TableFormula

    def sort(self, function_sort, reverse=False):
        """
        Sorts rows based on the values returned by *function_sort*.

        :param function_sort: function called on every row,
            it returns the sorting key of the row
        :param reverse: reverse order
        :raises ValueError: if the dataframe already has a column ``__key__``

        The function creates a column ``__key__`` and removes it later,
        even if the sorting itself fails.
        The changes happen inplace.
        """
        if "__key__" in self.columns:
            raise ValueError(
                "__key__ cannot be used in the original dataframe.")
        self["__key__"] = self.apply(lambda row: function_sort(row), axis=1)
        try:
            self.sort_values("__key__", inplace=True, ascending=not reverse)
        finally:
            # The temporary column must never leak into the user's data.
            self.drop("__key__", inplace=True, axis=1)

    def fgroupby(self, function_key, function_values, columns=None,
                 function_agg=None, function_weight=None):
        """
        Groups information based on columns defined by lambda functions.

        :param function_key: function called on every row, defines the key
        :param function_values: list of functions called on every row,
            each of them defines one column of values
        :param columns: name of the columns, if None, new ones
            will be created (``fv0``, ``fv1``, ...)
        :param function_agg: list of aggregation functions, one per value.
            Without weights, each function receives the values of a group,
            with weights, it receives the weighted values of a group
            and the sum of all weights.
            If None, the values (weighted values if *function_weight*
            is specified) are summed.
        :param function_weight: function called on every row,
            defines weights, can be None
        :return: :class:`TableFormula`, one row per key, one column per
            value, the key itself is not part of the result
        :raises ValueError: if a reserved column name is already used or
            if *function_values*, *columns*, *function_agg* do not have
            the same size

        The function uses columns ``__key__``, ``__weight__``.
        You should not use these names.
        Others columns are created ``__value_{0}__``.
        All of them are created on a copy of the dataframe,
        the original dataframe is not modified.

        Example:

        ::

            group = table.fgroupby(lambda v: v["name"],
                                   [lambda v: v["d_a"]],
                                   ["sum_d_a"],
                                   [lambda vec, w: sum(vec) / w],
                                   lambda v: v["d_b"])
        """
        if "__key__" in self.columns:
            raise ValueError(
                "__key__ cannot be used in the original dataframe.")
        if "__weight__" in self.columns:
            raise ValueError(
                "__weight__ cannot be used in the original dataframe.")

        weighted = function_weight is not None

        if columns is None:
            columns = ["fv{0}".format(i)
                       for i in range(len(function_values))]
        if len(columns) != len(function_values):
            raise ValueError(
                "Parameters function_values and columns must have the same size.")

        if function_agg is None:
            if weighted:
                # Weighted aggregators take two arguments (values, total
                # weight), the default one only sums the weighted values.
                def _weighted_sum(vec, _total_weight):
                    return vec.sum()

                function_agg = [_weighted_sum for _ in columns]
            else:
                function_agg = ["sum" for _ in columns]
        if len(function_agg) != len(function_values):
            raise ValueError(
                "Parameters function_values and function_agg must have the same size.")

        cp = self.copy()
        cp["__key__"] = cp.apply(lambda row: function_key(row), axis=1)
        if weighted:
            cp["__weight__"] = cp.apply(
                lambda row: function_weight(row), axis=1)

        values = []
        rep = {}
        for fct, cnew in zip(function_values, columns):
            tmp = "__value_{0}__".format(cnew)
            values.append(tmp)
            rep[tmp] = cnew
            computed = cp.apply(lambda row, fct=fct: fct(row), axis=1)
            cp[tmp] = computed * cp["__weight__"] if weighted else computed

        if weighted:
            sum_weight = cp["__weight__"].sum()
            # agg=agg binds the current aggregator to each lambda.
            aggs = {name: (lambda c, agg=agg: agg(c, sum_weight))
                    for name, agg in zip(values, function_agg)}
        else:
            aggs = dict(zip(values, function_agg))

        gr = cp.groupby("__key__", as_index=False).agg(aggs)
        # Temporary names are replaced by the names the user asked for.
        gr.columns = [rep.get(c, c) for c in gr.columns]
        gr = gr.drop("__key__", axis=1)
        return TableFormula(gr)

    def add_column_index(self, index, name=None):
        """
        Changes the index.

        :param index: new_index
        :param name: name of the index
        :raises ValueError: if the dataframe already has a column ``__key__``

        The function creates a column ``__key__`` which becomes the index.
        The changes happen inplace.
        """
        if "__key__" in self.columns:
            raise ValueError(
                "__key__ cannot be used in the original dataframe.")
        self["__key__"] = index
        self.set_index("__key__", inplace=True)
        self.index.rename(name, inplace=True)

    def add_column_vector(self, name, values):
        """
        Adds a column knowing its name and a vector of values.

        :param name: name of the column
        :param values: values

        The changes happen inplace.
        """
        self[name] = values

    def addc(self, name, function_value):
        """
        Adds a column knowing its name and a lambda function.

        :param name: name of the column
        :param function_value: function called on every row,
            it returns the value of the new column for this row

        The changes happen inplace.
        """
        self[name] = self.apply(lambda row: function_value(row), axis=1)

    def graph_XY(self, curves, xlabel=None, ylabel=None, marker=True,
                 link_point=False, title=None, format_date="%Y-%m-%d",
                 legend_loc=0, figsize=None, ax=None):
        """
        Plots one or several curves defined by functions applied on rows.

        :param curves: list of 3-uples (generator for X, generator for Y, label)
        :param xlabel: label for X axis
        :param ylabel: label for Y axis
        :param marker: add a marker for each point
        :param link_point: link points between them
        :param title: graph title
        :param format_date: if X axis is a datetime object,
            the function will use this format to print dates
        :param legend_loc: location of the legend
        :param figsize: size of the figure
        :param ax: :epkg:`matplotlib:Axis` or None to create a new one
        :return: :epkg:`matplotlib:Axis`
        :raises KeyError: if *marker* and *link_point* are both False

        NOTE(review): a former version of this documentation mentioned
        4-uples (generator for X, generator for Y, generator for labels,
        label), the loop below only unpacks 3-uples.

        For the legend position, see `matplotlib
        <http://matplotlib.org/api/legend_api.html>`_.

        Example:

        ::

            table.graph_XY([
                [lambda v: v["sum_a"], lambda v: v["sum_b"], "xy label 1"],
                [lambda v: v["sum_b"], lambda v: v["sum_c"], "xy label 2"],
            ])
        """
        # Resolved first so that an unsupported combination fails
        # before any figure is created.
        smarker = {(True, True): 'o-',
                   (True, False): 'o',
                   (False, True): '-',
                   # (False, False) :''
                   }[marker, link_point]

        if ax is None:
            import matplotlib.pyplot as plt  # pylint: disable=C0415
            _, ax = plt.subplots(1, 1, figsize=figsize)

        has_date = False
        for xf, yf, label in curves:
            x = self.apply(xf, axis=1)
            y = self.apply(yf, axis=1)
            # iloc: the first row is not necessarily labelled 0.
            if len(x) > 0 and isinstance(x.iloc[0], datetime.datetime):
                import matplotlib.dates  # pylint: disable=C0415
                x = [matplotlib.dates.date2num(d) for d in x]
                has_date = True
            ax.plot(x, y, smarker, label=label)

        if has_date:
            import matplotlib.dates  # pylint: disable=C0415
            hfmt = matplotlib.dates.DateFormatter(format_date)
            if "%H" in format_date or "%M" in format_date:
                ax.xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
            ax.xaxis.set_major_formatter(hfmt)
            ax.get_figure().autofmt_xdate()

        if xlabel is not None:
            ax.set_xlabel(xlabel)
        if ylabel is not None:
            ax.set_ylabel(ylabel)
        if title is not None:
            ax.set_title(title)

        ax.legend(loc=legend_loc)
        return ax