1"""
2@file
3@brief Validates runtime for many :scikit-learn: operators.
4The submodule relies on :epkg:`onnxconverter_common`,
5:epkg:`sklearn-onnx`.
6"""
7import numpy
8from sklearn.base import (
9 ClusterMixin, BiclusterMixin, OutlierMixin,
10 RegressorMixin, ClassifierMixin)
11from sklearn.calibration import CalibratedClassifierCV
12from sklearn.cross_decomposition import PLSSVD
13from sklearn.datasets import load_iris
14from sklearn.decomposition import LatentDirichletAllocation, NMF
15from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
16from sklearn.ensemble import (
17 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier,
18 BaggingClassifier, VotingClassifier, GradientBoostingClassifier,
19 RandomForestClassifier)
20try:
21 from sklearn.ensemble import StackingClassifier, StackingRegressor
22except ImportError: # pragma: no cover
23 # new in 0.22
24 StackingClassifier, StackingRegressor = None, None
25from sklearn.feature_extraction import DictVectorizer, FeatureHasher
26from sklearn.feature_extraction.text import (
27 CountVectorizer, TfidfVectorizer, TfidfTransformer)
28from sklearn.experimental import enable_hist_gradient_boosting # pylint: disable=W0611
29from sklearn.ensemble import (
30 HistGradientBoostingRegressor,
31 HistGradientBoostingClassifier)
32from sklearn.feature_selection import (
33 RFE, RFECV, GenericUnivariateSelect,
34 SelectPercentile, SelectFwe, SelectKBest,
35 SelectFdr, SelectFpr, SelectFromModel)
36from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
37from sklearn.isotonic import IsotonicRegression
38from sklearn.linear_model import (
39 ARDRegression, ElasticNetCV,
40 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
41 SGDRegressor, OrthogonalMatchingPursuitCV,
42 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet,
43 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso,
44 PassiveAggressiveClassifier, RidgeClassifier,
45 RidgeClassifierCV, PassiveAggressiveRegressor,
46 HuberRegressor, LogisticRegression, SGDClassifier,
47 LogisticRegressionCV, Perceptron)
48from sklearn.mixture._base import BaseMixture
49from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
50from sklearn.multiclass import (
51 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier)
52from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
53from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB
54from sklearn.neighbors import (
55 NearestCentroid, RadiusNeighborsClassifier,
56 NeighborhoodComponentsAnalysis)
57from sklearn.preprocessing import (
58 LabelBinarizer, LabelEncoder,
59 OneHotEncoder, PowerTransformer)
60from sklearn.semi_supervised import LabelPropagation, LabelSpreading
61from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC
62from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier
63from sklearn.utils import shuffle
64from skl2onnx.common.data_types import (
65 FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
66from ._validate_problems_helper import (
67 _noshapevar, _1d_problem, text_alpha_num)


def _modify_dimension(X, n_features, seed=19):
    """
    Modifies the number of features to increase
    or reduce it.

    @param      X           features matrix
    @param      n_features  number of features
    @param      seed        random seed (to get the same
                            dataset at each call)
    @return                 new features matrix
    """
    if n_features is None or n_features == X.shape[1]:
        return X
    if n_features < X.shape[1]:
        return X[:, :n_features]
    rstate = numpy.random.RandomState(seed)  # pylint: disable=E1101
    res = numpy.empty((X.shape[0], n_features), dtype=X.dtype)
    res[:, :X.shape[1]] = X[:, :]
    div = max((n_features // X.shape[1]) + 1, 2)
    for i in range(X.shape[1], res.shape[1]):
        j = i % X.shape[1]
        col = X[:, j]
        if X.dtype in (numpy.float32, numpy.float64):
            sigma = numpy.var(col) ** 0.5
            rnd = rstate.randn(len(col)) * sigma / div
            col2 = col + rnd
            res[:, j] -= col2 / div
            res[:, i] = col2
        elif X.dtype in (numpy.int32, numpy.int64):
            perm = rstate.permutation(col)
            h = rstate.randint(0, div) % X.shape[0]
            col2 = col.copy()
            col2[h::div] = perm[h::div]  # pylint: disable=E1136
            res[:, i] = col2
            h = (h + 1) % X.shape[0]
            res[h, j] = perm[h]  # pylint: disable=E1136
        else:  # pragma: no cover
            raise NotImplementedError(  # pragma: no cover
                "Unable to add noise to a feature for this type {}".format(X.dtype))
    return res
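
# Illustrative sketch (not part of the module): how ``_modify_dimension`` grows or
# shrinks a feature matrix. The variable names below are hypothetical and the
# snippet is only meant as an example.
#
#   from sklearn.datasets import load_iris
#   X = load_iris().data                  # shape (150, 4)
#   X7 = _modify_dimension(X, 7)          # adds 3 noisy copies of existing columns
#   X2 = _modify_dimension(X, 2)          # keeps only the first 2 columns
#   assert X7.shape == (150, 7) and X2.shape == (150, 2)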


###########
# datasets
###########


def _problem_for_predictor_binary_classification(
        dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    binary classification problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
        cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))
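
# Illustrative sketch (not part of the module): the tuple returned by a problem
# function is typically consumed as follows -- fit an estimator on (X, y),
# convert it to ONNX and call the declared method. The conversion below uses
# :epkg:`sklearn-onnx` and :epkg:`onnxruntime` only as an example of what the
# validation code does downstream; variable names are hypothetical.
#
#   from sklearn.linear_model import LogisticRegression
#   from skl2onnx import to_onnx
#   import onnxruntime
#   X, y, init, method, out_index, X_rt = _problem_for_predictor_binary_classification()
#   model = LogisticRegression().fit(X, y)
#   onx = to_onnx(model, X[:1])
#   sess = onnxruntime.InferenceSession(onx.SerializeToString())
#   res = sess.run(None, {'X': X_rt})
#   # res[out_index] corresponds to the declared method, here 'predict_proba'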


def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-class (m-cl) classification problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))


def _problem_for_mixture(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    mixture problem (no target is returned).
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, None, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))


def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-label classification problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y2 = numpy.zeros((y.shape[0], 3), dtype=numpy.int64)
    for i, _ in enumerate(y):
        y2[i, _] = 1
    for i in range(0, y.shape[0], 5):
        y2[i, (y[i] + 1) % 3] = 1
    X = X.astype(dtype)
    y2 = y2.astype(numpy.int64)
    return (X, y2, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))


def _problem_for_predictor_regression(many_output=False, options=None,
                                      n_features=None, nbrows=None,
                                      dtype=numpy.float32, add_nan=False,
                                      **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    regression problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    meth = 'predict' if kwargs is None else ('predict', kwargs)
    itt = [('X', X[:1].astype(dtype))]
    if n_features is not None:
        X = X[:, :n_features]
        itt = [('X', X[:1].astype(dtype))]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
        itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    if add_nan:
        rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
        cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))


def _problem_for_predictor_multi_regression(many_output=False, options=None,
                                            n_features=None, nbrows=None,
                                            dtype=numpy.float32, **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-output regression problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target.astype(float) + numpy.arange(len(data.target)) / 100
    meth = 'predict' if kwargs is None else ('predict', kwargs)
    itt = [('X', X[:1].astype(dtype))]
    if n_features is not None:
        X = X[:, :n_features]
        itt = [('X', X[:1].astype(dtype))]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
        itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    y2 = numpy.empty((y.shape[0], 2))
    y2[:, 0] = y
    y2[:, 1] = y + 0.5
    X = X.astype(dtype)
    y2 = y2.astype(dtype)
    return (X, y2, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))


def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, initial_types, method, name, X runtime* for a
    transformation problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    return (X, None, [('X', X[:1].astype(dtype))],
            'transform', 0, X.astype(dtype=numpy.float32))


def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None):
    """
    Returns *X, initial_types, method, name, X runtime* for a
    transformation problem on positive features.
    It is based on the Iris dataset.
    """
    data = load_iris()
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*data.data.shape) / 3
    X = numpy.abs(data.data + rnd)
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    return (X, None, [('X', X[:1].astype(dtype))],
            'transform', 0, X.astype(dtype=numpy.float32))


def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, initial_types, method, name, X runtime* for a
    trainable transformation problem (a continuous target is needed).
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'transform', 0, X.astype(dtype))


def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None):
    """
    Returns *X, initial_types, method, name, X runtime* for a
    trainable transformation problem (a class target is needed).
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'transform', 0, X.astype(dtype))


def _problem_for_clustering(dtype=numpy.float32, n_features=None):
    """
    Returns *X, initial_types, method, name, X runtime* for a
    clustering problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    return (X, None, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))


def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
    """
    Returns *X, initial_types, method, name, X runtime* for a
    clustering problem which checks the scores (or distances),
    not the cluster indices.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    return (X, None, [('X', X[:1].astype(dtype))],
            'transform', 1, X.astype(dtype))


def _problem_for_outlier(dtype=numpy.float32, n_features=None):
    """
    Returns *X, initial_types, method, name, X runtime* for an
    outlier detection problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    return (X, None, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))


def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100
    y /= numpy.max(y)
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'score', 0, X.astype(dtype))


def _problem_for_clnoproba(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    classification problem which does not check *predict_proba*.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))


def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem which does not check *predict_proba*.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
        cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))


def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    classification problem checked with *decision_function*.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'decision_function', 1, X.astype(dtype))


def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked with *decision_function*.
    It is based on the Iris dataset.
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'decision_function', 1, X.astype(dtype))


def _problem_for_label_encoder(dtype=numpy.int64, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`.
    """
    data = load_iris()
    # X = data.data
    y = data.target.astype(dtype)
    itt = [('X', y[:1].astype(dtype))]
    y = y.astype(dtype)
    return (y, None, itt, 'transform', 0, y)


def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`.
    """
    data = load_iris()
    # X = data.data
    y = data.target
    y2 = [{_: dtype(1000 + i)} for i, _ in enumerate(y)]
    y2[0][2] = -2
    cltype = FloatTensorType if dtype == numpy.float32 else DoubleTensorType
    itt = [("X", DictionaryType(StringTensorType([1]), cltype([1])))]
    y2 = numpy.array(y2)
    y = y.astype(numpy.int64)
    return (y2, y, itt, 'transform', 0, y2)


def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.
    """
    X = numpy.array([_[0] for _ in text_alpha_num])
    y = numpy.array([_[1] for _ in text_alpha_num], dtype=dtype)
    itt = [("X", StringTensorType([None]))]
    return (X, y, itt, 'transform', 0, X)


def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`.
    """
    X = numpy.array([_[0] for _ in text_alpha_num])
    y = numpy.array([_[1] for _ in text_alpha_num], dtype=dtype)
    X2 = CountVectorizer().fit_transform(X).astype(dtype)
    cltype = FloatTensorType if dtype == numpy.float32 else DoubleTensorType
    itt = [("X", cltype([None, X2.shape[1]]))]
    return (X2, y, itt, 'transform', 0, X2)


def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:FeatureHasher`.
    """
    data = load_iris()
    # X = data.data
    y = data.target
    y2 = [{("cl%d" % _): dtype(1000 + i)} for i, _ in enumerate(y)]
    y2[0]["cl2"] = -2
    cltype = FloatTensorType if dtype == numpy.float32 else DoubleTensorType
    itt = [("X", DictionaryType(StringTensorType([1]), cltype([1])))]
    y2 = numpy.array(y2)
    return (y2, y, itt, 'transform', 0, y2)


def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`.
    """
    data = load_iris()
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*data.data.shape) / 3
    X = _modify_dimension(data.data + rnd, n_features)
    X = X.astype(numpy.int32).astype(dtype)
    y = data.target
    X, y = shuffle(X, y, random_state=1)
    itt = [('X', X[:1].astype(dtype))]
    return (X[:, :1], y, itt, 'transform', 0, X[:, :1].astype(dtype))
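
# Illustrative sketch (not part of the module): the one-hot problem feeds a single
# integer-valued column to :epkg:`sklearn:preprocessing:OneHotEncoder`. Variable
# names are hypothetical.
#
#   from sklearn.preprocessing import OneHotEncoder
#   X1, y, init, method, out_index, X_rt = _problem_for_one_hot_encoder()
#   enc = OneHotEncoder(handle_unknown='ignore').fit(X1)
#   dense = enc.transform(X_rt).toarray()   # the method declared by the problem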


def find_suitable_problem(model):
    """
    Determines problems suitable for a given
    :epkg:`scikit-learn` operator. It may be one of:

    * `b-cl`: binary classification
    * `m-cl`: multi-class classification
    * `m-label`: multi-label classification
      (multiple labels possible at the same time)
    * `reg`: regression
    * `m-reg`: regression multi-output
    * `num-tr`: transform numerical features
    * `num-tr-pos`: transform numerical positive features
    * `scoring`: transform numerical features, target is usually needed
    * `outlier`: outlier prediction
    * `linearsvc`: classifier without *predict_proba*
    * `cluster`: similar to transform
    * `num+y-tr`: similar to transform with targets
    * `num+y-tr-cl`: similar to transform with classes
    * `num-tr-clu`: similar to cluster, but returns
      scores or distances instead of cluster indices
    * `key-col`: list of dictionaries
    * `text-col`: one column of text

    Suffix `nofit` indicates the prediction happens
    without the model being fitted. This is the case
    for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`.
    The suffix `-cov` indicates the method `predict` was called
    with parameter ``return_cov=True``, `-std` indicates
    method `predict` was called with parameter ``return_std=True``.
    The suffix ``-NSV`` creates an input variable
    like the following ``[('X', FloatTensorType([None, None]))]``.
    That is a way to bypass :epkg:`onnxruntime` shape checking:
    one part of the graph is designed to handle any
    kind of dimension but apparently, if the input shape is
    precise, every part of the graph has to be precise as well.
    Using strings as dimensions makes the shape precise and
    unprecise at the same time. Suffix ``'-64'`` means the model
    computes with double precision. Suffix ``-nop`` means the classifier
    does not implement method *predict_proba*. Suffix ``-1d``
    means a one dimension problem (one feature). Suffix ``-dec``
    checks method `decision_function`.

    The following script gives the list of :epkg:`scikit-learn`
    models and the problems they can be fitted on.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning
        :rst:

        from mlprodict.onnxrt.validate.validate import (
            sklearn_operators, find_suitable_problem)
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        res = sklearn_operators()
        rows = []
        for model in res[:20]:
            name = model['name']
            row = dict(name=name)
            try:
                prob = find_suitable_problem(model['cl'])
                if prob is None:
                    continue
                for p in prob:
                    row[p] = 'X'
            except RuntimeError:
                pass
            rows.append(row)
        df = DataFrame(rows).set_index('name')
        df = df.sort_index()
        print(df2rst(df, index=True))

    The list is truncated. The full list can be found at
    :ref:`l-model-problem-list`.
    """
    from ...onnx_conv.validate_scenarios import find_suitable_problem as ext_find_suitable_problem

    def _internal(model):  # pylint: disable=R0911

        # checks that this model is not overwritten by this module
        ext = ext_find_suitable_problem(model)
        if ext is not None:
            return ext

        # Exceptions
        if model in {GaussianProcessRegressor}:
            # m-reg causes MemoryError on some machine.
            return ['~b-reg-NF-64',  # '~m-reg-NF-64',
                    '~b-reg-NF-cov-64',  # '~m-reg-NF-cov-64',
                    '~b-reg-NF-std-64',  # '~m-reg-NF-std-64',
                    '~b-reg-NSV-64',  # '~m-reg-NSV-64',
                    '~b-reg-cov-64',  # '~m-reg-cov-64',
                    '~b-reg-std-NSV-64',  # '~m-reg-std-NSV-64',
                    'b-reg', '~b-reg-64',  # 'm-reg'
                    ]

        if model in {DictVectorizer}:
            return ['key-int-col']

        if model in {TfidfVectorizer, CountVectorizer}:
            return ['text-col']

        if model in {TfidfTransformer}:
            return ['bow']

        if model in {FeatureHasher}:
            return ['key-str-col']

        if model in {OneHotEncoder}:
            return ['one-hot']

        if model in {LabelBinarizer, LabelEncoder}:
            return ['int-col']

        if model in {NuSVC, SVC, SGDClassifier,
                     HistGradientBoostingClassifier}:
            return ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan']

        if model in {GaussianProcessClassifier}:
            return ['b-cl', 'm-cl', '~b-cl-64']

        if model in {BaggingClassifier, BernoulliNB, CalibratedClassifierCV,
                     ComplementNB, GaussianNB,
                     GradientBoostingClassifier, LabelPropagation, LabelSpreading,
                     LinearDiscriminantAnalysis, LogisticRegressionCV,
                     MultinomialNB, QuadraticDiscriminantAnalysis,
                     RandomizedSearchCV}:
            return ['b-cl', 'm-cl']

        if model in {Perceptron}:
            return ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec']

        if model in {AdaBoostRegressor}:
            return ['b-reg', '~b-reg-64']

        if model in {HistGradientBoostingRegressor}:
            return ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64']

        if model in {LinearSVC, NearestCentroid}:
            return ['~b-cl-nop', '~b-cl-nop-64']

        if model in {RFE, RFECV}:
            return ['num+y-tr']

        if model in {GridSearchCV}:
            return ['b-cl', 'm-cl',
                    'b-reg', 'm-reg',
                    '~b-reg-64', '~b-cl-64',
                    'cluster', 'outlier', '~m-label']

        if model in {VotingClassifier}:
            return ['b-cl', 'm-cl']

        if StackingClassifier is not None and model in {StackingClassifier}:
            return ['b-cl']

        if StackingRegressor is not None and model in {StackingRegressor}:
            return ['b-reg']

        # specific scenarios
        if model in {IsotonicRegression}:
            return ['~num+y-tr-1d', '~b-reg-1d']

        if model in {ARDRegression, BayesianRidge, ElasticNetCV,
                     GradientBoostingRegressor,
                     LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
                     LinearSVR, NuSVR, OrthogonalMatchingPursuitCV,
                     PassiveAggressiveRegressor, SGDRegressor,
                     TheilSenRegressor, HuberRegressor, SVR}:
            return ['b-reg', '~b-reg-64']

        if model in {MultiOutputClassifier}:
            return ['m-cl', '~m-label']

        if model in {MultiOutputRegressor, MultiTaskElasticNet,
                     MultiTaskElasticNetCV, MultiTaskLassoCV,
                     MultiTaskLasso}:
            return ['m-reg']

        if model in {OneVsOneClassifier, OutputCodeClassifier,
                     PassiveAggressiveClassifier, RadiusNeighborsClassifier}:
            return ['~b-cl-nop', '~m-cl-nop']

        if model in {RidgeClassifier, RidgeClassifierCV}:
            return ['~b-cl-nop', '~m-cl-nop', '~m-label']

        # trainable transform
        if model in {GenericUnivariateSelect,
                     NeighborhoodComponentsAnalysis,
                     PLSSVD, SelectKBest,
                     SelectPercentile, SelectFromModel}:
            return ["num+y-tr"]

        if model in {SelectFwe, SelectFdr, SelectFpr}:
            return ["num+y-tr-cl"]

        # no m-label
        if model in {AdaBoostClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl']

        if model in {LogisticRegression}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']

        if model in {RandomForestClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']

        if model in {DecisionTreeClassifier, ExtraTreeClassifier}:
            return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label']

        if model in {DecisionTreeRegressor}:
            return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100']

        if model in {LatentDirichletAllocation, NMF, PowerTransformer}:
            return ['num-tr-pos']

        if hasattr(model, 'predict'):
            if "Classifier" in str(model):
                return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
            elif "Regressor" in str(model):
                return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']

        # Generic case.
        res = []
        if hasattr(model, 'transform'):
            if issubclass(model, (RegressorMixin, ClassifierMixin)):
                res.extend(['num+y-tr'])
            elif issubclass(model, (ClusterMixin, BiclusterMixin)):
                res.extend(['~num-tr-clu', '~num-tr-clu-64'])
            else:
                res.extend(['num-tr'])

        if hasattr(model, 'predict') and issubclass(model, (ClusterMixin, BiclusterMixin)):
            res.extend(['cluster', '~b-clu-64'])

        if issubclass(model, (OutlierMixin)):
            res.extend(['outlier'])

        if issubclass(model, ClassifierMixin):
            if model is OneVsRestClassifier:
                return ['m-cl', '~m-label']
            res.extend(['b-cl', '~b-cl-64', 'm-cl', '~m-label'])
        if issubclass(model, RegressorMixin):
            res.extend(['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64'])
        if issubclass(model, BaseMixture):
            res.extend(['mix', '~mix-64'])

        if len(res) > 0:
            return res

        raise RuntimeError("Unable to find problem for model '{}' - {}."
                           "".format(model.__name__, model.__bases__))

    res = _internal(model)
    for r in res:
        if r not in _problems:
            raise ValueError(  # pragma: no cover
                "Unrecognized problem '{}' in\n{}".format(
                    r, "\n".join(sorted(_problems))))
    return res
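
# Illustrative sketch (not part of the module): mapping an estimator class to its
# problems and instantiating the first one, as the validation code does with
# ``find_suitable_problem`` and ``_problems``. Variable names are hypothetical.
#
#   from sklearn.linear_model import LogisticRegression
#   probs = find_suitable_problem(LogisticRegression)
#   # e.g. ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']
#   X, y, init, method, out_index, X_rt = _problems[probs[0]]()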


_problems = {
    # standard
    "b-cl": _problem_for_predictor_binary_classification,
    "m-cl": _problem_for_predictor_multi_classification,
    "b-reg": _problem_for_predictor_regression,
    "m-reg": _problem_for_predictor_multi_regression,
    "num-tr": _problem_for_numerical_transform,
    "num-tr-pos": _problem_for_numerical_transform_positive,
    'outlier': _problem_for_outlier,
    'cluster': _problem_for_clustering,
    'num+y-tr': _problem_for_numerical_trainable_transform,
    'num+y-tr-cl': _problem_for_numerical_trainable_transform_cl,
    'mix': _problem_for_mixture,
    # others
    '~num-tr-clu': _problem_for_clustering_scores,
    "~m-label": _problem_for_predictor_multi_classification_label,
    "~scoring": _problem_for_numerical_scoring,
    '~b-cl-nop': _problem_for_clnoproba_binary,
    '~m-cl-nop': _problem_for_clnoproba,
    '~b-cl-dec': _problem_for_cl_decision_function_binary,
    '~m-cl-dec': _problem_for_cl_decision_function,
    # nan
    "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression(
        n_features=n_features, add_nan=True),
    "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features, add_nan=True),
    "~b-cl-nan": lambda dtype=numpy.float32, n_features=None: _problem_for_predictor_binary_classification(
        dtype=dtype, n_features=n_features, add_nan=True),
    # 100 features
    "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression(
        n_features=n_features or 100),
    "~b-cl-f100": lambda n_features=100: _problem_for_predictor_binary_classification(
        n_features=n_features or 100),
    # 64
    "~b-cl-64": lambda n_features=None: _problem_for_predictor_binary_classification(
        dtype=numpy.float64, n_features=n_features),
    "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features),
    '~b-cl-nop-64': lambda n_features=None: _problem_for_clnoproba(
        dtype=numpy.float64, n_features=n_features),
    '~b-clu-64': lambda n_features=None: _problem_for_clustering(
        dtype=numpy.float64, n_features=n_features),
    '~b-cl-dec-64': lambda n_features=None: _problem_for_cl_decision_function_binary(
        dtype=numpy.float64, n_features=n_features),
    '~num-tr-clu-64': lambda n_features=None: _problem_for_clustering_scores(
        dtype=numpy.float64, n_features=n_features),
    "~m-reg-64": lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features),
    "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform(
        dtype=numpy.float64, n_features=n_features),
    '~mix-64': lambda n_features=None: _problem_for_mixture(
        dtype=numpy.float64, n_features=n_features),
    #
    "~b-cl-NF": (lambda n_features=None: _problem_for_predictor_binary_classification(
        n_features=n_features) + (False, )),
    "~m-cl-NF": (lambda n_features=None: _problem_for_predictor_multi_classification(
        n_features=n_features) + (False, )),
    "~b-reg-NF": (lambda n_features=None: _problem_for_predictor_regression(
        n_features=n_features) + (False, )),
    "~m-reg-NF": (lambda n_features=None: _problem_for_predictor_multi_regression(
        n_features=n_features) + (False, )),
    #
    "~b-cl-NF-64": (lambda n_features=None: _problem_for_predictor_binary_classification(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-cl-NF-64": (lambda n_features=None: _problem_for_predictor_multi_classification(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~b-reg-NF-64": (lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    # GaussianProcess
    "~b-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    #
    "~b-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    #
    "~b-reg-cov-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features)),
    "~m-reg-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features)),
    #
    "~reg-std-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features)),
    "~m-reg-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features)),
    #
    '~b-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features)),
    '~m-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features)),
    "~b-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features))),
    "~m-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features))),
    # isotonic
    "~b-reg-1d": _1d_problem(_problem_for_predictor_regression),
    '~num+y-tr-1d': _1d_problem(_problem_for_numerical_trainable_transform),
    # text
    "key-int-col": _problem_for_dict_vectorizer,
    "key-str-col": _problem_for_feature_hasher,
    "int-col": _problem_for_label_encoder,
    "one-hot": _problem_for_one_hot_encoder,
    'text-col': _problem_for_tfidf_vectorizer,
    'bow': _problem_for_tfidf_transformer,
}
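
# Illustrative sketch (not part of the module): every entry of ``_problems`` is a
# callable returning a problem tuple, so any scenario can be instantiated by key.
# Variable names are hypothetical.
#
#   X, y, init, method, out_index, X_rt = _problems['~b-cl-64']()   # float64 binary classification
#   X100, *_ = _problems['~b-cl-f100']()                            # 100 features
#   assert X.dtype == numpy.float64 and X100.shape[1] == 100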