Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Validates runtime for many :epkg:`scikit-learn` operators.
4The submodule relies on :epkg:`onnxconverter_common`,
5:epkg:`sklearn-onnx`.
6"""
7import math
8import copy
9from timeit import Timer
10import os
11import warnings
12from importlib import import_module
13import pickle
14from time import perf_counter
15import numpy
16from sklearn.base import BaseEstimator
17from sklearn.linear_model._base import LinearModel
18from sklearn.model_selection import train_test_split
19from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version
20from .validate_problems import _problems
23class RuntimeBadResultsError(RuntimeError):
24 """
25 Raised when the results are too different from
26 :epkg:`scikit-learn`.
27 """
29 def __init__(self, msg, obs):
30 """
31 :param msg: to display
32 :param obs: observations
33 """
34 RuntimeError.__init__(self, msg)
35 self.obs = obs
38def _dictionary2str(di):
39 el = []
40 for k in sorted(di):
41 el.append('{}={}'.format(k, di[k]))
42 return '/'.join(el)
45def modules_list():
46 """
47 Returns modules and versions currently used.
49 .. runpython::
50 :showcode:
51 :rst:
52 :warningout: DeprecationWarning
54 from mlprodict.onnxrt.validate.validate_helper import modules_list
55 from pyquickhelper.pandashelper import df2rst
56 from pandas import DataFrame
57 print(df2rst(DataFrame(modules_list())))
58 """
59 def try_import(name):
60 try:
61 mod = import_module(name)
62 except ImportError: # pragma: no cover
63 return None
64 return (dict(name=name, version=mod.__version__)
65 if hasattr(mod, '__version__') else dict(name=name))
67 rows = []
68 for name in sorted(['pandas', 'numpy', 'sklearn', 'mlprodict',
69 'skl2onnx', 'onnxmltools', 'onnx', 'onnxruntime',
70 'scipy']):
71 res = try_import(name)
72 if res is not None:
73 rows.append(res)
74 return rows
77def _dispsimple(arr, fLOG):
78 if isinstance(arr, (tuple, list)):
79 for i, a in enumerate(arr):
80 fLOG("output %d" % i)
81 _dispsimple(a, fLOG)
82 elif hasattr(arr, 'shape'):
83 if len(arr.shape) == 1:
84 threshold = 8
85 else:
86 threshold = min(
87 50, min(50 // arr.shape[1], 8) * arr.shape[1])
88 fLOG(numpy.array2string(arr, max_line_width=120,
89 suppress_small=True,
90 threshold=threshold))
91 else: # pragma: no cover
92 s = str(arr)
93 if len(s) > 50:
94 s = s[:50] + "..."
95 fLOG(s)
98def _merge_options(all_conv_options, aoptions):
99 if aoptions is None:
100 return copy.deepcopy(all_conv_options)
101 if not isinstance(aoptions, dict):
102 return copy.deepcopy(aoptions) # pragma: no cover
103 merged = {}
104 for k, v in all_conv_options.items():
105 if k in aoptions:
106 merged[k] = _merge_options(v, aoptions[k])
107 else:
108 merged[k] = copy.deepcopy(v)
109 for k, v in aoptions.items():
110 if k in all_conv_options:
111 continue
112 merged[k] = copy.deepcopy(v)
113 return merged
116def sklearn_operators(subfolder=None, extended=False,
117 experimental=True):
118 """
119 Builds the list of operators from :epkg:`scikit-learn`.
120 The function goes through the list of submodule
121 and get the list of class which inherit from
122 :epkg:`scikit-learn:base:BaseEstimator`.
124 :param subfolder: look into only one subfolder
125 :param extended: extends the list to the list of operators
126 this package implements a converter for
127 :param experimental: includes experimental module from
128 :epkg:`scikit-learn` (see `sklearn.experimental
129 <https://github.com/scikit-learn/scikit-learn/
130 tree/master/sklearn/experimental>`_)
131 :return: the list of found operators
132 """
133 if experimental:
134 from sklearn.experimental import ( # pylint: disable=W0611
135 enable_hist_gradient_boosting,
136 enable_iterative_imputer)
138 subfolders = sklearn__all__ + ['mlprodict.onnx_conv']
139 found = []
140 for subm in sorted(subfolders):
141 if isinstance(subm, list):
142 continue # pragma: no cover
143 if subfolder is not None and subm != subfolder:
144 continue
146 if subm == 'feature_extraction':
147 subs = [subm, 'feature_extraction.text']
148 else:
149 subs = [subm]
151 for sub in subs:
152 if '.' in sub and sub not in {'feature_extraction.text'}:
153 name_sub = sub
154 else:
155 name_sub = "{0}.{1}".format("sklearn", sub)
156 try:
157 mod = import_module(name_sub)
158 except ModuleNotFoundError:
159 continue
161 if hasattr(mod, "register_converters"):
162 fct = getattr(mod, "register_converters")
163 cls = fct()
164 else:
165 cls = getattr(mod, "__all__", None)
166 if cls is None:
167 cls = list(mod.__dict__)
168 cls = [mod.__dict__[cl] for cl in cls]
170 for cl in cls:
171 try:
172 issub = issubclass(cl, BaseEstimator)
173 except TypeError:
174 continue
175 if cl.__name__ in {'Pipeline', 'ColumnTransformer',
176 'FeatureUnion', 'BaseEstimator',
177 'BaseEnsemble', 'BaseDecisionTree'}:
178 continue
179 if cl.__name__ in {'CustomScorerTransform'}:
180 continue
181 if (sub in {'calibration', 'dummy', 'manifold'} and
182 'Calibrated' not in cl.__name__):
183 continue
184 if issub:
185 pack = "sklearn" if sub in sklearn__all__ else cl.__module__.split('.')[
186 0]
187 found.append(
188 dict(name=cl.__name__, subfolder=sub, cl=cl, package=pack))
190 if extended:
191 from ...onnx_conv import register_converters
192 with warnings.catch_warnings():
193 warnings.simplefilter("ignore", ResourceWarning)
194 models = register_converters(True)
196 done = set(_['name'] for _ in found)
197 for m in models:
198 try:
199 name = m.__module__.split('.')
200 except AttributeError as e: # pragma: no cover
201 raise AttributeError("Unexpected value, m={}".format(m)) from e
202 sub = '.'.join(name[1:])
203 pack = name[0]
204 if m.__name__ not in done:
205 found.append(
206 dict(name=m.__name__, cl=m, package=pack, sub=sub))
208 # let's remove models which cannot predict
209 all_found = found
210 found = []
211 for mod in all_found:
212 cl = mod['cl']
213 if hasattr(cl, 'fit_predict') and not hasattr(cl, 'predict'):
214 continue
215 if hasattr(cl, 'fit_transform') and not hasattr(cl, 'transform'):
216 continue
217 if (not hasattr(cl, 'transform') and
218 not hasattr(cl, 'predict') and
219 not hasattr(cl, 'decision_function')):
220 continue
221 found.append(mod)
222 return found
225def _measure_time(fct, repeat=1, number=1, first_run=True):
226 """
227 Measures the execution time for a function.
229 :param fct: function to measure
230 :param repeat: number of times to repeat
231 :param number: number of times between two measures
232 :param first_run: if True, runs the function once before measuring
233 :return: last result, average, values
234 """
235 res = None
236 values = []
237 if first_run:
238 fct()
239 for __ in range(repeat):
240 begin = perf_counter()
241 for _ in range(number):
242 res = fct()
243 end = perf_counter()
244 values.append(end - begin)
245 if repeat * number == 1:
246 return res, values[0], values
247 return res, sum(values) / (repeat * number), values # pragma: no cover
250def _shape_exc(obj):
251 if hasattr(obj, 'shape'):
252 return obj.shape
253 if isinstance(obj, (list, dict, tuple)):
254 return "[{%d}]" % len(obj)
255 return None
258def dump_into_folder(dump_folder, obs_op=None, is_error=True,
259 **kwargs):
260 """
261 Dumps information when an error was detected
262 using :epkg:`*py:pickle`.
264 :param dump_folder: dump_folder
265 :param obs_op: obs_op (information)
266 :param is_error: is it an error or not?
267 :param kwargs: additional parameters
268 :return: name
269 """
270 if dump_folder is None:
271 raise ValueError("dump_folder cannot be None.")
272 optim = obs_op.get('optim', '')
273 optim = str(optim)
274 optim = optim.replace("<class 'sklearn.", "")
275 optim = optim.replace("<class '", "")
276 optim = optim.replace(" ", "")
277 optim = optim.replace(">", "")
278 optim = optim.replace("=", "")
279 optim = optim.replace("{", "")
280 optim = optim.replace("}", "")
281 optim = optim.replace(":", "")
282 optim = optim.replace("'", "")
283 optim = optim.replace("/", "")
284 optim = optim.replace("\\", "")
285 parts = (obs_op['runtime'], obs_op['name'], obs_op['scenario'],
286 obs_op['problem'], optim,
287 "op" + str(obs_op.get('opset', '-')),
288 "nf" + str(obs_op.get('n_features', '-')))
289 name = "dump-{}-{}.pkl".format(
290 "ERROR" if is_error else "i",
291 "-".join(map(str, parts)))
292 name = os.path.join(dump_folder, name)
293 obs_op = obs_op.copy()
294 fcts = [k for k in obs_op if k.startswith('lambda')]
295 for fct in fcts:
296 del obs_op[fct]
297 kwargs.update({'obs_op': obs_op})
298 with open(name, "wb") as f:
299 pickle.dump(kwargs, f)
300 return name
303def default_time_kwargs():
304 """
305 Returns default values *number* and *repeat* to measure
306 the execution of a function.
308 .. runpython::
309 :showcode:
310 :warningout: DeprecationWarning
312 from mlprodict.onnxrt.validate.validate_helper import default_time_kwargs
313 import pprint
314 pprint.pprint(default_time_kwargs())
316 keys define the number of rows,
317 values defines *number* and *repeat*.
318 """
319 return {
320 1: dict(number=30, repeat=20),
321 10: dict(number=20, repeat=20),
322 100: dict(number=8, repeat=10),
323 1000: dict(number=5, repeat=5),
324 10000: dict(number=3, repeat=3),
325 }
328def measure_time(stmt, x, repeat=10, number=50, div_by_number=False, first_run=True):
329 """
330 Measures a statement and returns the results as a dictionary.
332 :param stmt: string
333 :param x: matrix
334 :param repeat: average over *repeat* experiment
335 :param number: number of executions in one row
336 :param div_by_number: divide by the number of executions
337 :param first_run: if True, runs the function once before measuring
338 :return: dictionary
340 See `Timer.repeat <https://docs.python.org/3/library/timeit.html?timeit.Timer.repeat>`_
341 for a better understanding of parameter *repeat* and *number*.
342 The function returns a duration corresponding to
343 *number* times the execution of the main statement.
344 """
345 if x is None:
346 raise ValueError("x cannot be None") # pragma: no cover
348 try:
349 stmt(x)
350 except RuntimeError as e: # pragma: no cover
351 raise RuntimeError("{}-{}".format(type(x), x.dtype)) from e
353 def fct():
354 stmt(x)
356 if first_run:
357 fct()
358 tim = Timer(fct)
359 res = numpy.array(tim.repeat(repeat=repeat, number=number))
360 total = numpy.sum(res)
361 if div_by_number:
362 res /= number
363 mean = numpy.mean(res)
364 dev = numpy.mean(res ** 2)
365 dev = max(0, (dev - mean**2)) ** 0.5
366 mes = dict(average=mean, deviation=dev, min_exec=numpy.min(res),
367 max_exec=numpy.max(res), repeat=repeat, number=number,
368 total=total)
369 return mes
372def _multiply_time_kwargs(time_kwargs, time_kwargs_fact, inst):
373 """
374 Multiplies values in *time_kwargs* following strategy
375 *time_kwargs_fact* for a given model *inst*.
377 :param time_kwargs: see below
378 :param time_kwargs_fact: see below
379 :param inst: :epkg:`scikit-learn` model
380 :return: new *time_kwargs*
382 Possible values for *time_kwargs_fact*:
384 - a integer: multiplies *number* by this number
385 - `'lin'`: multiplies value *number* for linear models depending
386 on the number of rows to process (:math:`\\propto 1/\\log_{10}(n)`)
388 .. runpython::
389 :showcode:
390 :warningout: DeprecationWarning
392 from pprint import pprint
393 from sklearn.linear_model import LinearRegression
394 from mlprodict.onnxrt.validate.validate_helper import (
395 default_time_kwargs, _multiply_time_kwargs)
397 lr = LinearRegression()
398 kw = default_time_kwargs()
399 pprint(kw)
401 kw2 = _multiply_time_kwargs(kw, 'lin', lr)
402 pprint(kw2)
403 """
404 if time_kwargs is None:
405 raise ValueError("time_kwargs cannot be None.") # pragma: no cover
406 if time_kwargs_fact in ('', None):
407 return time_kwargs
408 try:
409 vi = int(time_kwargs_fact)
410 time_kwargs_fact = vi
411 except (TypeError, ValueError):
412 pass
413 if isinstance(time_kwargs_fact, int):
414 time_kwargs_modified = copy.deepcopy(time_kwargs)
415 for k in time_kwargs_modified:
416 time_kwargs_modified[k]['number'] *= time_kwargs_fact
417 return time_kwargs_modified
418 if time_kwargs_fact == 'lin':
419 if isinstance(inst, LinearModel):
420 time_kwargs_modified = copy.deepcopy(time_kwargs)
421 for k in time_kwargs_modified:
422 kl = max(int(math.log(k) / math.log(10) + 1e-5), 1)
423 f = max(int(10 / kl + 0.5), 1)
424 time_kwargs_modified[k]['number'] *= f
425 time_kwargs_modified[k]['repeat'] *= 1
426 return time_kwargs_modified
427 return time_kwargs
428 raise ValueError( # pragma: no cover
429 "Unable to interpret time_kwargs_fact='{}'.".format(
430 time_kwargs_fact))
433def _get_problem_data(prob, n_features):
434 data_problem = _problems[prob](n_features=n_features)
435 if len(data_problem) == 6:
436 X_, y_, init_types, method, output_index, Xort_ = data_problem
437 dofit = True
438 elif len(data_problem) == 7:
439 X_, y_, init_types, method, output_index, Xort_, dofit = data_problem
440 else:
441 raise RuntimeError( # pragma: no cover
442 "Unable to interpret problem '{}'.".format(prob))
443 if (len(X_.shape) == 2 and X_.shape[1] != n_features and
444 n_features is not None):
445 raise RuntimeError( # pragma: no cover
446 "Problem '{}' with n_features={} returned {} features"
447 "(func={}).".format(prob, n_features, X_.shape[1],
448 _problems[prob]))
449 if y_ is None:
450 (X_train, X_test, Xort_train, # pylint: disable=W0612
451 Xort_test) = train_test_split(
452 X_, Xort_, random_state=42)
453 y_train, y_test = None, None
454 else:
455 (X_train, X_test, y_train, y_test, # pylint: disable=W0612
456 Xort_train, Xort_test) = train_test_split(
457 X_, y_, Xort_, random_state=42)
458 if isinstance(init_types, tuple):
459 init_types, conv_options = init_types
460 else:
461 conv_options = None
463 if isinstance(method, tuple):
464 method_name, predict_kwargs = method
465 else:
466 method_name = method
467 predict_kwargs = {}
469 return (X_train, X_test, y_train,
470 y_test, Xort_test,
471 init_types, conv_options, method_name,
472 output_index, dofit, predict_kwargs)