Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Missing values and pandas. 

5""" 

6import pandas 

7import numpy 

8from .joins import df_crossjoin 

9 

10 

def add_missing_indices(df, column, all_values, values=None, fillvalue=numpy.nan):
    """
    After aggregation, it usually happens that the series is sparse.
    This function adds rows for missing time.

    @param      df          dataframe to extend
    @param      column      column with time
    @param      all_values  all the values we want (list, numpy array,
                            pandas Series or a one-column dataframe whose
                            only column is named *column*), it is never
                            modified by this function
    @param      values      columns which contain the values, the others are considered as the keys
    @param      fillvalue   scalar written into the non-key columns of the
                            rows added by this function; a missing value
                            (the default) leaves them as produced by the merge
    @return                 new dataframe sorted by *column*

    Raises *ValueError* if *all_values* is a dataframe which does not have
    exactly one column named *column* and *TypeError* if *all_values*
    has an unsupported type.

    .. exref::
        :title: Add missing values in one column.

        .. runpython::
            :showcode:

            import pandas
            from pyensae.mlhelper import add_missing_indices
            df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
            df2 = add_missing_indices(df, "x", [3, 4, 5, 6])
            print(df2)

        .. runpython::
            :showcode:

            import pandas
            from pyensae.mlhelper import add_missing_indices
            df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
            df2 = add_missing_indices(df, "x", values=["y"], all_values=[3, 4, 5, 6])
            print(df2)

    """
    if isinstance(values, str):
        values = [values]
    # Every column which is neither the index column nor a value column
    # is a key.
    if values is None or len(values) == 0:
        keys = [c for c in df.columns if c != column]
    else:
        keys = [c for c in df.columns if c not in values and c != column]

    if isinstance(all_values, (list, pandas.Series, numpy.ndarray)):
        dfti = pandas.DataFrame({column: all_values})
    elif isinstance(all_values, pandas.DataFrame):
        if all_values.shape[1] != 1:
            raise ValueError("all_values should have only one column")
        if all_values.columns[0] != column:
            raise ValueError(
                "all_values should have only one column with name '{0}'".format(column))
        # Work on a copy: the dtype alignment below assigns a column and
        # must not alter the dataframe owned by the caller.
        dfti = all_values.copy()
    else:
        raise TypeError(
            "Unexpected type for all_values '{0}'".format(type(all_values)))

    # Merge only happens on columns with the same type.
    # dfti holds a single column named *column* at this point.
    if column in df.columns and dfti[column].dtype != df[column].dtype:
        dfti[column] = dfti[column].astype(df[column].dtype)

    on_cols = keys + [column]
    if keys:
        # Distinct key combinations observed in df, combined with the
        # requested values through df_crossjoin.
        only = df[on_cols].groupby(
            keys, as_index=False).count().drop(column, axis=1)
        dfti = df_crossjoin(only, dfti)

    # Only a scalar which is not a missing value triggers the filling,
    # anything else keeps the plain merge.
    fill_requested = (pandas.api.types.is_scalar(fillvalue) and
                      not pandas.isna(fillvalue))
    if not fill_requested:
        dfj = df.merge(dfti, on=on_cols, how="right")
    else:
        # The merge indicator tells which rows only exist in dfti, these
        # are the rows added by this function. Values which were already
        # missing in df are left untouched.
        marker = "_merge"
        taken = set(df.columns) | set(dfti.columns)
        while marker in taken:
            marker += "_"
        dfj = df.merge(dfti, on=on_cols, how="right", indicator=marker)
        added = dfj[marker] == "right_only"
        dfj = dfj.drop(marker, axis=1)
        for c in dfj.columns:
            if c not in on_cols:
                dfj[c] = dfj[c].where(~added, fillvalue)
    return dfj.sort_values(column)