Coverage for src/pymlbenchmark/benchmark/bench_helper.py

1"""

2@file

3@brief Implements a benchmark about performance.

4"""

5import pandas

6from pandas.api.types import is_numeric_dtype

def enumerate_options(options, filter_fct=None):
    """
    Enumerates all possible options.

    @param      options     dictionary ``{name: list of values}``
    @param      filter_fct  filters out some configurations, it is called
                            as ``filter_fct(**configuration)`` and the
                            configuration is kept if it returns a true value,
                            None to keep every configuration
    @return                 iterator on dictionaries ``{name: value}``

    Names are sorted and the last name varies the fastest.
    The function is a generator: a *ValueError* (no option at all,
    or one option without any value) is only raised when the
    iteration starts.

    .. runpython::
        :showcode:

        from pymlbenchmark.benchmark.bench_helper import enumerate_options
        options = dict(c1=[0, 1], c2=["aa", "bb"])
        for row in enumerate_options(options):
            print("no-filter", row)

        def filter_out(**opt):
            return not (opt["c1"] == 1 and opt["c2"] == "aa")

        for row in enumerate_options(options, filter_out):
            print("filter", row)
    """
    # Local import: keeps the block self-contained without touching
    # the module-level imports.
    from itertools import product

    keys = sorted(options)
    if not keys:
        # Explicit error instead of the accidental
        # "min() arg is an empty sequence".
        raise ValueError("No option to enumerate.")  # pragma: no cover
    for k in keys:
        # keys are sorted, so the first empty one is reported
        if len(options[k]) == 0:
            raise ValueError(  # pragma: no cover
                "Parameter '{0}' has no values.".format(k))

    # product advances the rightmost iterable first, which matches
    # the historical order (last sorted key varies the fastest).
    for combination in product(*(options[k] for k in keys)):
        opts = dict(zip(keys, combination))
        if filter_fct is None or filter_fct(**opts):
            yield opts

def bench_pivot(data, experiment='lib', value='mean', index=None):
    """
    Merges all results for one set of parameters in one row.

    @param      data        :epkg:`DataFrame`
    @param      experiment  column (or list of columns) which identifies
                            an experiment
    @param      value       value to plot, a column name or a list
                            of column names
    @param      index       set of parameters which identifies
                            an experiment, if None, guesses it
    @return                 :epkg:`DataFrame`

    When *index* is None, every column which is not a metric, not an
    experiment column, does not end with ``_nodes`` or ``_size`` and is
    not entirely null is used as an index; missing values in these
    columns are replaced by -1 (numerical columns) or an empty string
    (other columns) on a copy of *data*.

    The function raises *ValueError* if a requested column is missing
    or if the same combination of *index* and *experiment* appears
    more than once.

    .. runpython::
        :showcode:

        import pandas
        from pymlbenchmark.datasets import experiment_results
        from pymlbenchmark.benchmark.bench_helper import bench_pivot

        df = experiment_results('onnxruntime_LogisticRegression')
        piv = bench_pivot(df)
        print(piv.head())
    """
    if not isinstance(experiment, list):
        experiment = [experiment]

    if index is None:
        metrics = {'lower', 'max', 'max3', 'mean',
                   'median', 'min', 'min3', 'repeat', 'upper'}
        # the caller's dataframe must not be modified by the filling below
        data = data.copy()
        index = []
        for c in data.columns:
            if c in metrics or c in experiment:
                continue
            if c.endswith(('_nodes', '_size')):
                continue
            nn = int(data[c].isnull().sum())
            if nn == data.shape[0]:
                # entirely null (or empty dataframe): useless as a key
                continue
            if nn > 0:
                # Explicit assignment instead of a chained
                # ``fillna(..., inplace=True)`` which does not update
                # *data* when pandas copy-on-write is enabled.
                filler = -1 if is_numeric_dtype(data[c]) else ""
                data[c] = data[c].fillna(filler)
            index.append(c)
    elif isinstance(index, str):
        index = [index]
    else:
        # accepts tuples or any iterable, ``index + experiment`` needs a list
        index = list(index)

    values = [value] if isinstance(value, str) else list(value)
    keep = index + values + experiment
    for c in keep:
        if c not in data.columns:
            raise ValueError(  # pragma: no cover
                "Unable to find '{}' in {}.".format(c, data.columns))

    data_short = data[keep]
    gr = data_short.groupby(index + experiment).count()
    # *values* is always a list here, the comparison yields a dataframe
    # and works for one or several value columns.
    duplicated = gr[(gr[values] >= 2).any(axis=1)]
    if duplicated.shape[0] > 0:
        raise ValueError(  # pragma: no cover
            "The set of parameters does not identify an experiment."
            "\nindex: {}\nexperiment: {}\nvalue: {}\ncolumns: {}\n--\n{}".format(
                index, experiment, value, data.columns, duplicated))

    # *value* is passed as received to keep the shape of the columns
    # (flat for a single name, one level more for a list).
    piv = pandas.pivot_table(data_short, values=value, index=index,
                             columns=experiment, aggfunc='mean',
                             dropna=False)
    return piv

117

118

def remove_almost_nan_columns(df, keep=None, fill_keep=True):
    """
    Automatically removes columns with more than 1/3
    nan values.

    @param      df          dataframe
    @param      keep        columns to skip, they are never removed
    @param      fill_keep   if True, missing values in columns *keep*
                            are replaced by -1 (numerical columns) or
                            an empty string (other columns, which are
                            also converted into strings)
    @return                 clean dataframe

    The input dataframe is not modified: columns are filled on a copy
    and the removal returns a new dataframe.
    """
    keep = [] if keep is None else list(keep)
    nrows = df.shape[0]
    # Strict comparison on exact counts: a floor division threshold
    # (``nrows // 3``) is 0 for tiny dataframes and used to remove
    # columns without any missing value.
    nanc = [c for c in df.columns
            if c not in keep and 3 * int(df[c].isnull().sum()) > nrows]
    if keep and fill_keep:
        df = df.copy()
        for c in keep:
            # Explicit assignments instead of chained in-place fillna,
            # which does not update *df* under pandas copy-on-write.
            if is_numeric_dtype(df[c]):
                df[c] = df[c].fillna(-1)
            else:
                # fill before the conversion, otherwise missing values
                # become the string 'nan' and are never replaced
                df[c] = df[c].fillna("").astype(str)
    if nanc:
        return df.drop(nanc, axis=1)
    return df

Coverage for src/pymlbenchmark/benchmark/bench_helper.py: 85%

68 statements