Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Implements a benchmark about performance.
4"""
5import pandas
6from pandas.api.types import is_numeric_dtype
def enumerate_options(options, filter_fct=None):
    """
    Enumerates all possible options, i.e. the cartesian product
    of the values of every parameter.

    @param      options     dictionary ``{name: list of values}``
    @param      filter_fct  filters out some configurations, it is called
                            as ``filter_fct(**configuration)`` and the
                            configuration is kept if it returns True,
                            every configuration is kept if None
    @return                 iterator on dictionaries ``{name: value}``

    Parameters are processed in the sorted order of their names
    and the last one varies the fastest. The function raises
    *ValueError* if *options* is empty or if one parameter has no value.
    As the function is a generator, the exception is raised when
    the first configuration is requested.

    .. runpython::
        :showcode:

        from pymlbenchmark.benchmark.bench_helper import enumerate_options
        options = dict(c1=[0, 1], c2=["aa", "bb"])
        for row in enumerate_options(options):
            print("no-filter", row)

        def filter_out(**opt):
            return not (opt["c1"] == 1 and opt["c2"] == "aa")

        for row in enumerate_options(options, filter_out):
            print("filter", row)
    """
    # Imported locally so that the block does not depend on a change
    # of the module-level imports.
    from itertools import product

    keys = sorted(options)
    if not keys:
        # The previous implementation failed on ``min([])`` with an
        # unrelated message, the exception type is unchanged.
        raise ValueError("No option was given.")
    for key in keys:
        if len(options[key]) == 0:
            raise ValueError(
                "Parameter '{0}' has no values.".format(key))

    # product iterates the rightmost iterable the fastest, which
    # matches the order of the former hand-written counter.
    for values in product(*(options[k] for k in keys)):
        opts = dict(zip(keys, values))
        if filter_fct is None or filter_fct(**opts):
            yield opts
def bench_pivot(data, experiment='lib', value='mean', index=None):
    """
    Merges all results for one set of parameters in one row.

    @param      data        :epkg:`DataFrame`
    @param      experiment  column (or list of columns) which
                            identifies an experiment
    @param      value       value to plot, a column name
                            or a list of column names
    @param      index       set of parameters which identifies
                            an experiment, if None, guesses it
    @return                 :epkg:`DataFrame`

    When *index* is None, every column which is not a known metric,
    not an experiment column, not a value column, whose name does not
    end with ``_nodes`` or ``_size`` and which is not entirely null is
    used as a parameter. Missing values in these columns are replaced
    by -1 (numeric columns) or an empty string (other columns) in a
    copy of *data*, the input dataframe is not modified.
    The function raises *ValueError* if a column is missing or if
    the parameters and the experiment columns do not identify
    a unique row.

    .. runpython::
        :showcode:

        import pandas
        from pymlbenchmark.datasets import experiment_results
        from pymlbenchmark.benchmark.bench_helper import bench_pivot

        df = experiment_results('onnxruntime_LogisticRegression')
        piv = bench_pivot(df)
        print(piv.head())
    """
    if not isinstance(experiment, list):
        experiment = [experiment]
    # Normalized view of *value*, the original object is still given
    # to pivot_table because a string and a list produce different
    # column layouts.
    values = [value] if isinstance(value, str) else list(value)

    if index is None:
        metrics = {'lower', 'max', 'max3', 'mean',
                   'median', 'min', 'min3', 'repeat', 'upper'}
        data = data.copy()
        n_rows = data.shape[0]
        index = []
        for c in data.columns:
            if c in metrics or c in experiment or c in values:
                continue
            if isinstance(c, str) and c.endswith(('_nodes', '_size')):
                continue
            n_missing = data[c].isnull().sum()
            if n_missing == n_rows:
                # Entirely null (or empty dataframe): not a parameter.
                continue
            if n_missing > 0:
                # Explicit assignment instead of a chained in-place
                # fillna which does not update *data* when pandas
                # copy-on-write is enabled.
                fill = -1 if is_numeric_dtype(data[c]) else ""
                data[c] = data[c].fillna(fill)
            index.append(c)
    elif isinstance(index, str):
        index = [index]
    else:
        # Accepts any iterable (tuple, Index, ...), a list is needed
        # for the concatenations below.
        index = list(index)

    keep = index + values + experiment
    for c in keep:
        if c not in data.columns:
            raise ValueError(
                "Unable to find '{}' in {}.".format(c, data.columns))

    data_short = data[keep]
    gr = data_short.groupby(index + experiment).count()
    # A count above one means several rows share the same parameters.
    # The test works row by row so that it is valid for one or
    # several value columns.
    duplicated = (gr[values] >= 2).any(axis=1)
    if duplicated.any():
        raise ValueError(
            "The set of parameters does not identify an experiment."
            "\nindex: {}\nexperiment: {}\nvalue: {}\ncolumns: {}\n--\n{}".format(
                index, experiment, value, data.columns, gr[duplicated]))

    piv = pandas.pivot_table(data_short, values=value, index=index,
                             columns=experiment, aggfunc='mean',
                             dropna=False)
    return piv
def remove_almost_nan_columns(df, keep=None, fill_keep=True):
    """
    Automatically removes columns with too many nan values:
    a column is removed when its number of missing values is
    at least one third of the number of rows (integer division)
    and at least one.

    @param      df          dataframe
    @param      keep        columns to skip, they are never removed
    @param      fill_keep   if True, missing values in columns *keep*
                            are replaced by -1 (numeric columns) or
                            an empty string (other columns, which are
                            converted into strings)
    @return                 clean dataframe

    The input dataframe is not modified, a copy is made
    before filling missing values.
    """
    if keep is None:
        keep = set()

    # With less than three rows, the integer division gives 0 and every
    # column would be removed even without any missing value:
    # at least one missing value is required.
    threshold = max(1, df.shape[0] // 3)
    nanc = [c for c in df.columns
            if c not in keep and df[c].isnull().sum() >= threshold]

    if keep and fill_keep:
        df = df.copy()
        for c in keep:
            if is_numeric_dtype(df[c]):
                # Explicit assignment, a chained in-place fillna does not
                # update *df* when pandas copy-on-write is enabled.
                df[c] = df[c].fillna(-1)
            else:
                # Missing cells are located before the conversion:
                # ``astype(str)`` may turn them into 'nan' or 'None'
                # which fillna cannot detect anymore.
                missing = df[c].isnull()
                df[c] = df[c].astype(str).mask(missing, "")

    if nanc:
        return df.drop(nanc, axis=1)
    return df