Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Missing values and pandas.
5"""
6import pandas
7import numpy
8from .joins import df_crossjoin
def add_missing_indices(df, column, all_values, values=None, fillvalue=numpy.nan):
    """
    After aggregation, it usually happens that the series is sparse.
    This function adds rows for missing time.

    @param      df          dataframe to extend, it is not modified
    @param      column      column with time
    @param      all_values  all the values we want in *column*: a list, a tuple,
                            a series, an array or a dataframe with a single
                            column named *column*, it is not modified
    @param      values      columns which contain the values (a name or a list of names),
                            the others are considered as the keys
    @param      fillvalue   scalar stored in the columns listed in *values* for the
                            rows which were added, the default value (NaN) and
                            *None* leave these cells missing, values already
                            missing in *df* are never replaced
    @return                 new dataframe sorted by *column*

    The function raises *ValueError* if *all_values* is a dataframe
    which does not hold exactly one column named *column* and
    *TypeError* if *all_values* has an unexpected type.

    .. exref::
        :title: Add missing values in one column.

        .. runpython::
            :showcode:

            import pandas
            from pyensae.mlhelper import add_missing_indices
            df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
            df2 = add_missing_indices(df, "x", [3, 4, 5, 6])
            print(df2)

        .. runpython::
            :showcode:

            import pandas
            from pyensae.mlhelper import add_missing_indices
            df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
            df2 = add_missing_indices(df, "x", values=["y"], all_values=[3, 4, 5, 6])
            print(df2)
    """
    if isinstance(values, str):
        values = [values]
    # len() rather than truthiness: *values* may be an array or an index.
    if values is None or len(values) == 0:
        value_cols = []
    else:
        value_cols = [c for c in df.columns if c != column and c in values]
    # Every column which is neither the time column nor a value is a key.
    keys = [c for c in df.columns if c != column and c not in value_cols]

    if isinstance(all_values, pandas.DataFrame):
        if all_values.shape[1] != 1:
            raise ValueError("all_values should have only one column")
        if all_values.columns[0] != column:
            raise ValueError(
                "all_values should have only one column with name '{0}'".format(column))
        dfti = all_values
    elif isinstance(all_values, (list, tuple, pandas.Series, numpy.ndarray)):
        dfti = pandas.DataFrame({column: all_values})
    else:
        raise TypeError(
            "Unexpected type for all_values '{0}'".format(type(all_values)))

    # Merge only happens on columns with the same type.
    # astype returns a new dataframe: a dataframe received
    # through all_values is never modified in place.
    casts = {c: df[c].dtype for c in dfti.columns
             if c in df.columns and dfti[c].dtype != df[c].dtype}
    if casts:
        dfti = dfti.astype(casts)

    if keys:
        on = keys + [column]
        # One row per distinct combination of keys found in df.
        only = df[on].groupby(keys, as_index=False).count().drop(column, axis=1)
        # df_crossjoin comes from .joins, it is expected to pair
        # every combination of keys with every wanted value.
        dfti = df_crossjoin(only, dfti)
    else:
        on = column

    is_nan = isinstance(fillvalue, float) and numpy.isnan(fillvalue)
    if not value_cols or fillvalue is None or is_nan:
        # Nothing to fill: added rows keep missing values.
        return df.merge(dfti, on=on, how="right").sort_values(column)

    # The indicator column tells which rows only exist in dfti,
    # its name must not collide with an existing column.
    marker = "_merge"
    while marker in df.columns or marker in dfti.columns:
        marker = "_" + marker
    dfj = df.merge(dfti, on=on, how="right", indicator=marker)
    added = dfj[marker] == "right_only"
    dfj = dfj.drop(marker, axis=1)
    for c in value_cols:
        dfj[c] = dfj[c].mask(added, fillvalue)
    return dfj.sort_values(column)