Coverage for src/pymlbenchmark/benchmark/bench_helper.py: 85%

68 statements  

« prev     ^ index     » next       coverage.py v7.2.1, created at 2023-03-08 00:27 +0100

1""" 

2@file 

3@brief Implements a benchmark about performance. 

4""" 

5import pandas 

6from pandas.api.types import is_numeric_dtype 

7 

8 

def enumerate_options(options, filter_fct=None):
    """
    Enumerates all possible options.

    @param      options     dictionary ``{name: list of values}``
    @param      filter_fct  filters out some configurations, it is called
                            as ``filter_fct(**configuration)`` and the
                            configuration is kept if the result is true
    @return                 iterator on dictionaries ``{name: value}``

    Names are sorted and the last name changes the fastest.
    The function raises *ValueError* if *options* is empty
    or if one parameter has no value. As the function is a generator,
    the exception is raised when the first element is requested.

    .. runpython::
        :showcode:

        from pymlbenchmark.benchmark.bench_helper import enumerate_options
        options = dict(c1=[0, 1], c2=["aa", "bb"])
        for row in enumerate_options(options):
            print("no-filter", row)

        def filter_out(**opt):
            return not (opt["c1"] == 1 and opt["c2"] == "aa")

        for row in enumerate_options(options, filter_out):
            print("filter", row)
    """
    # Local import: keeps the block self-contained without touching
    # the module-level imports.
    from itertools import product

    keys = sorted(options)
    if not keys:
        # The previous implementation failed here with an obscure
        # "min() arg is an empty sequence"; same exception type, clear text.
        raise ValueError("No parameter to enumerate, 'options' is empty.")
    for k in keys:
        if len(options[k]) == 0:
            raise ValueError(  # pragma: no cover
                "Parameter '{0}' has no values.".format(k))

    # product iterates like an odometer: the last key changes the fastest.
    for values in product(*(options[k] for k in keys)):
        opts = dict(zip(keys, values))
        if filter_fct is None or filter_fct(**opts):
            yield opts

48 

49 

def bench_pivot(data, experiment='lib', value='mean', index=None):
    """
    Merges all results for one set of parameters in one row.

    @param      data        :epkg:`DataFrame`
    @param      experiment  column (or list of columns)
                            which identifies an experiment
    @param      value       value to plot, a column name
                            or a list of column names
    @param      index       set of parameters which identifies
                            an experiment, if None, guesses it
    @return                 :epkg:`DataFrame`

    When *index* is None, every column which is not a metric,
    not an experiment column, does not end with ``_nodes`` or ``_size``
    and is not entirely null is used as an index. Missing values in
    these columns are replaced by -1 (numerical columns) or an empty
    string (other columns) in a copy of *data*.
    The function raises *ValueError* if a column is missing or if
    the index and the experiment columns do not identify a unique row.

    .. runpython::
        :showcode:

        import pandas
        from pymlbenchmark.datasets import experiment_results
        from pymlbenchmark.benchmark.bench_helper import bench_pivot

        df = experiment_results('onnxruntime_LogisticRegression')
        piv = bench_pivot(df)
        print(piv.head())
    """
    if not isinstance(experiment, list):
        experiment = [experiment]
    # *value* is kept untouched for pivot_table (a string gives flat
    # columns), *values* is its list form used for every other operation.
    values = [value] if isinstance(value, str) else list(value)

    if index is None:
        metrics = ('lower', 'max', 'max3', 'mean',
                   'median', 'min', 'min3', 'repeat', 'upper')
        data = data.copy()
        index = []
        for c in data.columns:
            if c in metrics or c in experiment:
                continue
            if isinstance(c, str) and c.endswith(('_nodes', '_size')):
                continue
            nn = int(data[c].isnull().sum())
            if nn == data.shape[0]:
                # Entirely null, it cannot identify anything.
                continue
            if nn > 0:
                # Explicit assignment: ``data[c].fillna(..., inplace=True)``
                # works on a temporary object and is a no-op when pandas
                # runs with copy-on-write.
                filler = -1 if is_numeric_dtype(data[c]) else ""
                data[c] = data[c].fillna(filler)
            index.append(c)
    else:
        index = list(index)

    keep = index + values + experiment
    for c in keep:
        if c not in data.columns:
            raise ValueError(  # pragma: no cover
                "Unable to find '{}' in {}.".format(c, data.columns))
    data_short = data[keep]

    # Every combination (index, experiment) must appear only once,
    # otherwise the pivot would silently average distinct experiments.
    gr = data_short.groupby(index + experiment).count()
    duplicated = (gr[values] >= 2).any(axis=1)
    if duplicated.any():
        raise ValueError(  # pragma: no cover
            "The set of parameters does not identify an experiment."
            "\nindex: {}\nexperiment: {}\nvalue: {}\ncolumns: {}\n--\n{}".format(
                index, experiment, value, data.columns, gr[duplicated]))
    piv = pandas.pivot_table(data_short, values=value, index=index, columns=experiment,
                             aggfunc='mean', dropna=False)
    return piv

117 

118 

def remove_almost_nan_columns(df, keep=None, fill_keep=True):
    """
    Automatically removes columns with more than 1/3
    nan values.

    @param      df          dataframe
    @param      keep        columns to skip, they are never removed
    @param      fill_keep   if True, fills nan values in columns *keep*:
                            -1 for numerical columns, an empty string
                            for the others which are also converted
                            into strings
    @return                 clean dataframe

    The input dataframe is not modified. The function returns
    *df* itself if nothing needs to be changed.
    """
    if keep is None:
        keep = set()
    n_rows = df.shape[0]
    # Exact comparison "nulls / rows > 1/3" without any rounding:
    # an integer threshold ``rows // 3`` is 0 for small dataframes and
    # every column would be removed, even columns without any nan.
    nanc = [c for c in df.columns
            if c not in keep and 3 * int(df[c].isnull().sum()) > n_rows]
    if keep and fill_keep:
        df = df.copy()
        for c in keep:
            if is_numeric_dtype(df[c]):
                # Explicit assignment: an inplace fillna on ``df[c]``
                # is a no-op when pandas runs with copy-on-write.
                df[c] = df[c].fillna(-1)
            else:
                # Missing values must be filled before the conversion,
                # otherwise they become the strings 'nan' or 'None'.
                df[c] = df[c].fillna("").astype(str)
    if nanc:
        return df.drop(nanc, axis=1)
    return df