Source code for pyensae.mlhelper.missing

# -*- coding: utf-8 -*-
"""
Missing values and pandas.


:githublink:`%|py|6`
"""
import pandas
import numpy
from .joins import df_crossjoin


def add_missing_indices(df, column, all_values, values=None, fillvalue=numpy.nan):
    """
    After aggregation, it usually happens that the series is sparse.
    This function adds rows for missing time.

    :param df: dataframe to extend
    :param column: column with time
    :param all_values: all the values we want in *column*: a list,
        a numpy array, a pandas Series or a dataframe with a single
        column named *column* (it is never modified)
    :param values: column name or list of columns which contain the values,
        the others are considered as the keys
    :param fillvalue: value stored in the value columns of the rows added
        by this function, the default value (NaN) leaves them missing;
        missing values already present in *df* are never replaced
    :return: new dataframe sorted by *column*
    :raises ValueError: if *all_values* is a dataframe which does not have
        exactly one column named *column*
    :raises TypeError: if *all_values* has an unsupported type

    .. exref::
        :title: Add missing values in one column.

        .. runpython::
            :showcode:

            import pandas
            from pyensae.mlhelper import add_missing_indices
            df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
            df2 = add_missing_indices(df, "x", [3, 4, 5, 6])
            print(df2)

        .. runpython::
            :showcode:

            import pandas
            from pyensae.mlhelper import add_missing_indices
            df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
            df2 = add_missing_indices(df, "x", values=["y"], all_values=[3, 4, 5, 6])
            print(df2)
    """
    if isinstance(values, str):
        values = [values]
    # len() rather than truthiness: values may be a numpy array or an Index.
    if values is None or len(values) == 0:
        keys = [c for c in df.columns if c != column]
    else:
        keys = [c for c in df.columns if c not in values and c != column]
    # Columns which are neither a key nor the extended column hold the values.
    value_cols = [c for c in df.columns if c not in keys and c != column]

    if isinstance(all_values, (list, pandas.Series, numpy.ndarray)):
        dfti = pandas.DataFrame({column: all_values})
    elif isinstance(all_values, pandas.DataFrame):
        dfti = all_values
        if dfti.shape[1] != 1:
            raise ValueError("all_values should have only one column")
        if dfti.columns[0] != column:
            raise ValueError(
                "all_values should have only one column with name '{0}'".format(column))
    else:
        raise TypeError(
            "Unexpected type for all_values '{0}'".format(type(all_values)))

    # Merge only happens on columns with the same type.
    # dfti only contains *column* at this point. astype returns a new
    # dataframe, so a dataframe given as all_values is left untouched.
    if column in df.columns and dfti[column].dtype != df[column].dtype:
        dfti = dfti.astype({column: df[column].dtype})

    if len(keys) == 0:
        on = column
    else:
        on = keys + [column]
        # One row per distinct combination of keys observed in df...
        only = df[on].groupby(
            keys, as_index=False).count().drop(column, axis=1)
        # ...combined with every requested value of column.
        dfti = df_crossjoin(only, dfti)

    keep_missing = fillvalue is None or (
        isinstance(fillvalue, float) and numpy.isnan(fillvalue))
    if keep_missing or not value_cols:
        # A right join already leaves NaN in the added rows.
        dfj = df.merge(dfti, on=on, how="right")
    else:
        # The merge indicator tells apart the added rows from the rows of df
        # which already hold a missing value; its name must not clash with
        # an existing column.
        flag = "_merge"
        while flag in df.columns or flag in dfti.columns:
            flag += "_"
        dfj = df.merge(dfti, on=on, how="right", indicator=flag)
        added = dfj[flag] == "right_only"
        if added.any():
            dfj.loc[added, value_cols] = fillvalue
        dfj = dfj.drop(flag, axis=1)
    return dfj.sort_values(column)