# Source code for pyensae.mlhelper.table_formula
# -*- coding: utf-8 -*-
"""
Adds functionalities to a dataframe.
:githublink:`%|py|6`
"""
import datetime
import pandas
class TableFormula(pandas.DataFrame):  # pylint: disable=W0223
    """
    Extends class :epkg:`pandas:DataFrame` or proposes extensions
    to existing functions using lambda functions.
    See `Extending Pandas
    <https://pandas.pydata.org/pandas-docs/
    stable/development/extending.html>`_.
    """

    @property
    def _constructor(self):
        """
        Returns the class pandas must use when an operation
        builds a new dataframe from this one.
        """
        return TableFormula

    def sort(self, function_sort, reverse=False):
        """
        Sorts rows based on the values returned by *function_sort*.

        :param function_sort: function called on every row,
            it returns the sorting key of the row
        :param reverse: reverse order (descending if True)
        :raises ValueError: if the dataframe already has a column ``__key__``

        The function creates a column ``__key__`` and removes it later,
        even when sorting fails. The changes happen inplace.
        """
        if "__key__" in self.columns:
            raise ValueError(
                "__key__ cannot be used in the original dataframe.")
        self["__key__"] = self.apply(lambda row: function_sort(row), axis=1)
        try:
            self.sort_values("__key__", inplace=True, ascending=not reverse)
        finally:
            # The temporary column must never leak into the user's data,
            # including when the keys cannot be compared.
            self.drop("__key__", inplace=True, axis=1)

    def fgroupby(self, function_key, function_values, columns=None,
                 function_agg=None, function_weight=None):
        """
        Groups information based on columns defined by lambda functions.

        :param function_key: function called on every row, defines the key
        :param function_values: list of functions called on every row,
            each of them defines one value to aggregate
        :param columns: name of the columns, if None, new ones will be
            created (``fv0``, ``fv1``, ...)
        :param function_agg: how to aggregate the data, one function per
            value, if None, the default is the sum of every group.
            Without weights, each function receives the vector of values
            of a group. With weights, it receives the vector of weighted
            values of a group and the sum of the weights of the
            **whole table** (not only the group).
        :param function_weight: function called on every row,
            defines weights, can be None
        :return: :class:`TableFormula`, one row per key, one column
            per value, the key itself is not part of the result
        :raises ValueError: if the dataframe already has a column
            ``__key__`` or ``__weight__`` or if *function_values*,
            *columns*, *function_agg* do not have the same size

        The function uses columns ``__key__``, ``__weight__``.
        You should not use these names.
        Others columns are created ``__value_{0}__``.
        All of them are created on a copy, the original dataframe
        is left unchanged.

        Example:

        ::

            group = table.fgroupby(lambda v: v["name"],
                                   [lambda v: v["d_a"]],
                                   ["sum_d_a"],
                                   [lambda vec, w: sum(vec) / w],
                                   lambda v: v["d_b"])
        """
        for reserved in ("__key__", "__weight__"):
            if reserved in self.columns:
                raise ValueError(
                    "{0} cannot be used in the original dataframe.".format(
                        reserved))

        weighted = function_weight is not None

        if columns is None:
            # One distinct name per value function.
            columns = ["fv{0}".format(i) for i in range(len(function_values))]
        if len(columns) != len(function_values):
            raise ValueError(
                "Parameters function_values and columns must have the same size.")
        if function_agg is None:
            # The default aggregation is a sum. The weighted signature takes
            # a second argument (the total weight) which a sum ignores.
            if weighted:
                function_agg = [(lambda vec, w: vec.sum()) for _ in columns]
            else:
                function_agg = [(lambda vec: vec.sum()) for _ in columns]
        if len(function_agg) != len(function_values):
            raise ValueError(
                "Parameters function_values and function_agg must have the same size.")

        cp = self.copy()
        cp["__key__"] = cp.apply(lambda row: function_key(row), axis=1)
        if weighted:
            cp["__weight__"] = cp.apply(
                lambda row: function_weight(row), axis=1)

        values = []
        rep = {}  # temporary column name -> final column name
        for fct, cnew in zip(function_values, columns):
            n = "__value_{0}__".format(cnew)
            values.append(n)
            rep[n] = cnew
            # fct=fct binds the current function (no late binding).
            cp[n] = cp.apply(lambda row, fct=fct: fct(row), axis=1)
            if weighted:
                cp[n] = cp[n] * cp["__weight__"]

        if weighted:
            sum_weight = cp["__weight__"].sum()
            aggs = {k: (lambda c, agg=agg: agg(c, sum_weight))
                    for k, agg in zip(values, function_agg)}
        else:
            aggs = dict(zip(values, function_agg))

        gr = cp.groupby("__key__", as_index=False).agg(aggs)
        gr.columns = [rep.get(c, c) for c in gr.columns]
        gr = gr.drop("__key__", axis=1)
        return TableFormula(gr)

    def add_column_index(self, index, name=None):
        """
        Changes the index.

        :param index: new_index
        :param name: name of the index
        :raises ValueError: if the dataframe already has a column ``__key__``

        The function creates a column ``__key__`` which becomes the index.
        The changes happen inplace.
        """
        if "__key__" in self.columns:
            # Same rule as the other methods relying on this temporary
            # column: an existing column would be overwritten and lost.
            raise ValueError(
                "__key__ cannot be used in the original dataframe.")
        self["__key__"] = index
        self.set_index("__key__", inplace=True)
        self.index.rename(name, inplace=True)

    def add_column_vector(self, name, values):
        """
        Adds a column knowing its name and a vector of values.

        :param name: name of the column
        :param values: values

        The changes happen inplace.
        """
        self[name] = values

    def addc(self, name, function_value):
        """
        Adds a column knowing its name and a lambda function.

        :param name: name of the column
        :param function_value: function called on every row,
            it returns the value of the new column for this row

        The changes happen inplace.
        """
        self[name] = self.apply(lambda row: function_value(row), axis=1)

    def graph_XY(self, curves, xlabel=None, ylabel=None, marker=True,
                 link_point=False, title=None, format_date="%Y-%m-%d",
                 legend_loc=0, figsize=None, ax=None):
        """
        Plots one or several curves defined by functions applied on rows.

        :param curves: list of 3-uples (generator for X, generator for Y, label),
            every generator is a function called on every row
        :param xlabel: label for X axis, ignored if None
        :param ylabel: label for Y axis, ignored if None
        :param marker: add a marker for each point
        :param link_point: link points between them
        :param title: graph title, ignored if None
        :param format_date: if X axis is a datetime object, the function will use this format
            to print dates
        :param legend_loc: location of the legend
        :param figsize: size of the figure, only used if *ax* is None
        :param ax: :epkg:`matplotlib:Axis` or None to create a new one
        :return: :epkg:`matplotlib:Axis`
        :raises KeyError: if *marker* and *link_point* are both False
            (nothing would be drawn)

        For the legend position, see `matplotlib <http://matplotlib.org/api/legend_api.html>`_.

        Example:

        ::

            table.graph_XY ( [ [ lambda v: v["sum_a"], lambda v: v["sum_b"], "xy label 1"],
                               [ lambda v: v["sum_b"], lambda v: v["sum_c"], "xy label 2"],
                               ])
        """
        if ax is None:
            import matplotlib.pyplot as plt  # pylint: disable=C0415
            _, ax = plt.subplots(1, 1, figsize=figsize)

        smarker = {(True, True): 'o-', (True, False): 'o', (False, True): '-',
                   # (False, False) :''
                   }[marker, link_point]

        has_date = False
        for xf, yf, label in curves:
            x = self.apply(xf, axis=1)
            y = self.apply(yf, axis=1)
            # Positional access: the index is not necessarily 0..n-1,
            # and the table may be empty.
            if len(x) > 0 and isinstance(x.iloc[0], datetime.datetime):
                import matplotlib.dates  # pylint: disable=C0415
                x = [matplotlib.dates.date2num(d) for d in x]
                has_date = True
            ax.plot(x, y, smarker, label=label)

        if has_date:
            import matplotlib.dates  # pylint: disable=C0415
            hfmt = matplotlib.dates.DateFormatter(format_date)
            if "%H" in format_date or "%M" in format_date:
                ax.xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
            ax.xaxis.set_major_formatter(hfmt)
            fig = ax.get_figure()
            fig.autofmt_xdate()

        if xlabel is not None:
            ax.set_xlabel(xlabel)
        if ylabel is not None:
            ax.set_ylabel(ylabel)
        if title is not None:
            ax.set_title(title)

        ax.legend(loc=legend_loc)
        return ax