# Source code for pyensae.mlhelper.missing
# -*- coding: utf-8 -*-
"""
Missing values and pandas.
:githublink:`%|py|6`
"""
import pandas
import numpy
from .joins import df_crossjoin
def add_missing_indices(df, column, all_values, values=None, fillvalue=numpy.nan):
    """
    Add the rows missing from a sparse series.

    After aggregation, it usually happens that the series is sparse.
    This function adds rows for missing time: *df* is right-merged with
    *all_values* so that every requested value of *column* appears in the
    result, for every combination of keys found in *df*.

    :param df: dataframe to extend, it is not modified
    :param column: column with time
    :param all_values: all the values we want, a list, a
        ``pandas.Series``, a ``numpy.ndarray`` or a dataframe with a
        single column named *column* (it is not modified)
    :param values: columns which contain the values, the others are considered as the keys
    :param fillvalue: scalar stored in the value columns (every column
        which is neither a key nor *column*) of the rows added by this
        function; with the default (NaN) the added rows keep the missing
        values produced by the merge
    :return: new dataframe, sorted by *column*
    :raises ValueError: if *all_values* is a dataframe which does not have
        exactly one column named *column*
    :raises TypeError: if *all_values* has an unsupported type

    .. exref::
        :title: Add missing values in one column.

        .. runpython::
            :showcode:

            import pandas
            from pyensae.mlhelper import add_missing_indices
            df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
            df2 = add_missing_indices(df, "x", [3, 4, 5, 6])
            print(df2)

        .. runpython::
            :showcode:

            import pandas
            from pyensae.mlhelper import add_missing_indices
            df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
            df2 = add_missing_indices(df, "x", values=["y"], all_values=[3, 4, 5, 6])
            print(df2)
    """
    if isinstance(values, str):
        values = [values]
    # Every column which is neither the time column nor a value column
    # is a key.
    if values is None or len(values) == 0:
        keys = [c for c in df.columns if c != column]
    else:
        keys = [c for c in df.columns if c not in values and c != column]

    if isinstance(all_values, (list, pandas.Series, numpy.ndarray)):
        dfti = pandas.DataFrame({column: all_values})
    elif isinstance(all_values, pandas.DataFrame):
        if all_values.shape[1] != 1:
            raise ValueError("all_values should have only one column")
        if all_values.columns[0] != column:
            raise ValueError(
                "all_values should have only one column with name '{0}'".format(column))
        # Work on a copy: the dtype alignment below assigns a column and
        # must not leak into the caller's dataframe.
        dfti = all_values.copy()
    else:
        raise TypeError(
            "Unexpected type for all_values '{0}'".format(type(all_values)))

    # Merge only happens on columns with the same type.
    cols = set(dfti.columns)
    for c in df.columns:
        if c in cols and dfti[c].dtype != df[c].dtype:
            dfti[c] = dfti[c].astype(df[c].dtype)

    if len(keys) == 0:
        key_cols = [column]
        merge_on = column
    else:
        key_cols = keys + [column]
        merge_on = key_cols
        # One row per combination of keys present in df.
        only = df[key_cols].groupby(
            keys, as_index=False).count().drop(column, axis=1)
        # NOTE(review): df_crossjoin comes from .joins; presumably it pairs
        # every key combination with every requested value - confirm.
        dfti = df_crossjoin(only, dfti)

    fill_requested = not (
        pandas.api.types.is_scalar(fillvalue) and pandas.isna(fillvalue))
    if not fill_requested:
        dfj = df.merge(dfti, on=merge_on, how="right")
    else:
        # The merge indicator tells which rows only exist on the right
        # side, i.e. the rows added here; values already missing in df
        # are left untouched.
        marker = "_merge"
        while marker in df.columns:
            marker += "_"
        dfj = df.merge(dfti, on=merge_on, how="right", indicator=marker)
        added = dfj[marker] == "right_only"
        dfj = dfj.drop(marker, axis=1)
        value_cols = [c for c in dfj.columns if c not in key_cols]
        if value_cols and added.any():
            dfj.loc[added, value_cols] = fillvalue
    return dfj.sort_values(column)