Coverage for mlprodict/onnxrt/validate/validate

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1"""

2@file

3@brief Validates runtime for many :epkg:`scikit-learn` operators.

4The submodule relies on :epkg:`onnxconverter_common`,

5:epkg:`sklearn-onnx`.

6"""

7import numpy

8from sklearn.base import (

9 ClusterMixin, BiclusterMixin, OutlierMixin,

10 RegressorMixin, ClassifierMixin)

11from sklearn.calibration import CalibratedClassifierCV

12from sklearn.cross_decomposition import PLSSVD

13from sklearn.datasets import load_iris

14from sklearn.decomposition import LatentDirichletAllocation, NMF

15from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

16from sklearn.ensemble import (

17 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier,

18 BaggingClassifier, VotingClassifier, GradientBoostingClassifier,

19 RandomForestClassifier)

20try:

21 from sklearn.ensemble import StackingClassifier, StackingRegressor

22except ImportError: # pragma: no cover

23 # new in 0.22

24 StackingClassifier, StackingRegressor = None, None

25from sklearn.feature_extraction import DictVectorizer, FeatureHasher

26from sklearn.feature_extraction.text import (

27 CountVectorizer, TfidfVectorizer, TfidfTransformer)

28from sklearn.experimental import enable_hist_gradient_boosting # pylint: disable=W0611

29from sklearn.ensemble import (

30 HistGradientBoostingRegressor,

31 HistGradientBoostingClassifier)

32from sklearn.feature_selection import (

33 RFE, RFECV, GenericUnivariateSelect,

34 SelectPercentile, SelectFwe, SelectKBest,

35 SelectFdr, SelectFpr, SelectFromModel)

36from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor

37from sklearn.isotonic import IsotonicRegression

38from sklearn.linear_model import (

39 ARDRegression, ElasticNetCV,

40 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,

41 SGDRegressor, OrthogonalMatchingPursuitCV,

42 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet,

43 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso,

44 PassiveAggressiveClassifier, RidgeClassifier,

45 RidgeClassifierCV, PassiveAggressiveRegressor,

46 HuberRegressor, LogisticRegression, SGDClassifier,

47 LogisticRegressionCV, Perceptron)

48from sklearn.mixture._base import BaseMixture

49from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

50from sklearn.multiclass import (

51 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier)

52from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier

53from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB

54from sklearn.neighbors import (

55 NearestCentroid, RadiusNeighborsClassifier,

56 NeighborhoodComponentsAnalysis)

57from sklearn.preprocessing import (

58 LabelBinarizer, LabelEncoder,

59 OneHotEncoder, PowerTransformer)

60from sklearn.semi_supervised import LabelPropagation, LabelSpreading

61from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC

62from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier

63from sklearn.utils import shuffle

64from skl2onnx.common.data_types import (

65 FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)

66from ._validate_problems_helper import (

67 _noshapevar, _1d_problem, text_alpha_num)

70def _modify_dimension(X, n_features, seed=19):

71 """

72 Modifies the number of features to increase

73 or reduce the number of features.

75 @param X features matrix

76 @param n_features number of features

77 @param seed random seed (to get the same

78 dataset at each call)

79 @return new featurs matrix

80 """

81 if n_features is None or n_features == X.shape[1]:

82 return X

83 if n_features < X.shape[1]:

84 return X[:, :n_features]

85 rstate = numpy.random.RandomState(seed) # pylint: disable=E1101

86 res = numpy.empty((X.shape[0], n_features), dtype=X.dtype)

87 res[:, :X.shape[1]] = X[:, :]

88 div = max((n_features // X.shape[1]) + 1, 2)

89 for i in range(X.shape[1], res.shape[1]):

90 j = i % X.shape[1]

91 col = X[:, j]

92 if X.dtype in (numpy.float32, numpy.float64):

93 sigma = numpy.var(col) ** 0.5

94 rnd = rstate.randn(len(col)) * sigma / div

95 col2 = col + rnd

96 res[:, j] -= col2 / div

97 res[:, i] = col2

98 elif X.dtype in (numpy.int32, numpy.int64):

99 perm = rstate.permutation(col)

100 h = rstate.randint(0, div) % X.shape[0]

101 col2 = col.copy()

102 col2[h::div] = perm[h::div] # pylint: disable=E1136

103 res[:, i] = col2

104 h = (h + 1) % X.shape[0]

105 res[h, j] = perm[h] # pylint: disable=E1136

106 else: # pragma: no cover

107 raise NotImplementedError( # pragma: no cover

108 "Unable to add noise to a feature for this type {}".format(X.dtype))

109 return res

110

111

112###########

113# datasets

114###########

115

116

def _problem_for_predictor_binary_classification(
        dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    binary classification problem.
    It is based on Iris dataset: gaussian noise drawn with a fixed
    seed is added to the features and class 2 is merged into class 1.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the original ones
    @param      add_nan     if True, replaces some values by NaN,
                            at most one cell for every three rows
    @return                 *X, y, initial_types, method, node name, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan and X.size > 0:
        # The upper bound of randint is exclusive: using the shape itself
        # lets the last row and the last column receive missing values and
        # avoids an empty range (ValueError) when X has a single column.
        # Positions come from the seeded generator so that the dataset
        # is the same at each call.
        n_missing = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_missing)
        cols = state.randint(0, X.shape[1], n_missing)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))

140

141

def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-class classification problem.
    It is based on Iris dataset, gaussian noise drawn with a fixed
    seed is added to the features, the three classes are kept.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict_proba', 1, features.astype(dtype))

159

160

def _problem_for_mixture(dtype=numpy.float32, n_features=None):
    """
    Returns *X, None, initial_types, method, node name, X runtime* for a
    mixture problem: there is no target, the checked method
    is *predict_proba*.
    It is based on Iris dataset, gaussian noise drawn with a fixed
    seed is added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the original ones
    @return                 *X, None, initial_types, method, node name, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    X = _modify_dimension(X, n_features)
    X = X.astype(dtype)
    # The target is not part of the problem, it is not even loaded.
    return (X, None, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))

178

179

def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-label classification problem.
    It is based on Iris dataset. The target is a matrix with three
    columns, one per class: every row gets its own class and one row
    out of five also gets the next class.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)

    classes = iris.target
    positions = numpy.arange(classes.shape[0])
    labels = numpy.zeros((classes.shape[0], 3), dtype=numpy.int64)
    labels[positions, classes] = 1
    doubled = positions[::5]
    labels[doubled, (classes[doubled] + 1) % 3] = 1

    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict_proba', 1, features.astype(dtype))

202

203

def _problem_for_predictor_regression(many_output=False, options=None,
                                      n_features=None, nbrows=None,
                                      dtype=numpy.float32, add_nan=False,
                                      **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    regression problem.
    It is based on Iris dataset, gaussian noise drawn with a fixed
    seed is added to the features.

    @param      many_output if True, the returned name is ``'all'``
                            instead of 0
    @param      options     if not None, *initial_types* becomes the pair
                            *(initial_types, options)*
    @param      n_features  number of features, None to keep
                            the original ones
    @param      nbrows      keeps the first *nbrows* rows if not None
    @param      dtype       type of the features and the target
    @param      add_nan     if True, replaces some values by NaN,
                            at most one cell for every three rows
    @param      kwargs      additional parameters returned along with
                            the method name
    @return                 *X, y, initial_types, method, name, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    # _modify_dimension already returns exactly n_features columns.
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    # ``**kwargs`` is always a dictionary (possibly empty), the method
    # is therefore always returned as a pair (name, parameters).
    meth = ('predict', kwargs)
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
    # The initial type is a copy taken before missing values are inserted.
    itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    if add_nan and X.size > 0:
        # The upper bound of randint is exclusive: using the shape itself
        # lets the last row and the last column receive missing values and
        # avoids an empty range (ValueError) when X has a single column.
        # Positions come from the seeded generator so that the dataset
        # is the same at each call.
        n_missing = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_missing)
        cols = state.randint(0, X.shape[1], n_missing)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))

239

240

def _problem_for_predictor_multi_regression(many_output=False, options=None,
                                            n_features=None, nbrows=None,
                                            dtype=numpy.float32, **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-output regression problem.
    It is based on Iris dataset. The target has two columns,
    the second one is the first one shifted by 0.5.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features)
    target = iris.target.astype(float) + numpy.arange(len(iris.target)) / 100

    # ``**kwargs`` is always a dictionary (possibly empty), the method
    # is therefore always returned as a pair (name, parameters).
    method = ('predict', kwargs)

    if n_features is not None:
        features = features[:, :n_features]
    if nbrows is not None:
        features = features[:nbrows, :]
        target = target[:nbrows]

    initial_types = [('X', features[:1].astype(dtype))]
    if options is not None:
        initial_types = initial_types, options

    targets = numpy.column_stack((target, target + 0.5)).astype(dtype)
    features = features.astype(dtype)
    output_name = 'all' if many_output else 0
    return (features, targets, initial_types,
            method, output_name, features.astype(dtype))

274

275

def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, None, initial_types, method, name, X runtime* for a
    transformation problem on numerical features.
    It is based on Iris dataset, gaussian noise drawn with a fixed
    seed is added to the features.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # NOTE(review): the runtime matrix is always float32 whatever *dtype* is,
    # unlike most problems of this file - confirm this is intended.
    runtime_features = features.astype(dtype=numpy.float32)
    return (features, None, initial_types,
            'transform', 0, runtime_features)

291

292

def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None):
    """
    Returns *X, None, initial_types, method, name, X runtime* for a
    transformation problem on numerical features.
    It is based on Iris dataset, the absolute value of the
    noisy features is taken before the dimension is changed.
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noisy = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(numpy.abs(noisy), n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # NOTE(review): the runtime matrix is always float32 whatever *dtype* is,
    # unlike most problems of this file - confirm this is intended.
    runtime_features = features.astype(dtype=numpy.float32)
    return (features, None, initial_types,
            'transform', 0, runtime_features)

307

308

def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which needs a target to be trained.
    It is based on Iris dataset, the target is the class
    plus a small increasing term, converted into *dtype*.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    drift = numpy.arange(len(iris.target)) / 100
    target = (iris.target + drift).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, target, initial_types,
            'transform', 0, features.astype(dtype))

326

327

def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which needs classes to be trained.
    It is based on Iris dataset, the target is the class
    stored as int64.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'transform', 0, features.astype(dtype))

345

346

def _problem_for_clustering(dtype=numpy.float32, n_features=None):
    """
    Returns *X, None, initial_types, method, name, X runtime* for a
    clustering problem, method *predict* is checked.
    It is based on Iris dataset, gaussian noise drawn with a fixed
    seed is added to the features.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict', 0, features.astype(dtype))

362

363

def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
    """
    Returns *X, None, initial_types, method, name, X runtime* for a
    clustering problem, the score part, not the cluster:
    method *transform* is checked and output 1 is compared.
    It is based on Iris dataset.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'transform', 1, features.astype(dtype))

379

380

def _problem_for_outlier(dtype=numpy.float32, n_features=None):
    """
    Returns *X, None, initial_types, method, name, X runtime* for an
    outlier detection problem, method *predict* is checked.
    It is based on Iris dataset, gaussian noise drawn with a fixed
    seed is added to the features.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict', 0, features.astype(dtype))

396

397

def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem.
    It is based on Iris dataset, the target is the class plus
    a small increasing term, scaled by its maximum.

    @param      dtype       type of the features and the target
    @param      n_features  number of features, None to keep
                            the original ones
    @return                 *X, y, initial_types, method, name, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    # n_features used to be silently ignored by this problem only,
    # it is now honoured like in the other problems based on Iris.
    X = _modify_dimension(X, n_features)
    y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100
    y /= numpy.max(y)
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'score', 0, X.astype(dtype))

415

416

def _problem_for_clnoproba(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    classification problem where method *predict* is checked
    instead of *predict_proba*. The three classes are kept.
    It is based on Iris dataset.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict', 0, features.astype(dtype))

434

435

def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem where method *predict* is checked
    instead of *predict_proba*.
    It is based on Iris dataset, class 2 is merged into class 1.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the original ones
    @param      add_nan     if True, replaces some values by NaN,
                            at most one cell for every three rows
    @return                 *X, y, initial_types, method, name, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan and X.size > 0:
        # The upper bound of randint is exclusive: using the shape itself
        # lets the last row and the last column receive missing values and
        # avoids an empty range (ValueError) when X has a single column.
        # Positions come from the seeded generator so that the dataset
        # is the same at each call.
        n_missing = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_missing)
        cols = state.randint(0, X.shape[1], n_missing)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))

458

459

def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    classification problem where method *decision_function*
    is checked. The three classes are kept.
    It is based on Iris dataset.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))

477

478

def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem where method *decision_function*
    is checked.
    It is based on Iris dataset, class 2 is merged into class 1.
    """
    iris = load_iris()
    noise = numpy.random.RandomState(seed=34).randn(  # pylint: disable=E1101
        *iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features).astype(dtype)
    # classes 1 and 2 become a single class
    labels = numpy.where(iris.target == 2, 1, iris.target).astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))

497

498

def _problem_for_label_encoder(dtype=numpy.int64, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`.
    The input is the target of Iris dataset converted into *dtype*,
    the same array is used to train and to run the model.
    Parameter *n_features* is not used.
    """
    labels = load_iris().target.astype(dtype)
    initial_types = [('X', labels[:1].astype(dtype))]
    return (labels, None, initial_types, 'transform', 0, labels)

509

510

def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`.
    Every observation is a dictionary with one key, the class of
    the observation in Iris dataset, mapped to ``1000 + index``.
    The first dictionary also receives key 2.
    Parameter *n_features* is not used.
    """
    classes = load_iris().target
    records = []
    for position, label in enumerate(classes):
        records.append({label: dtype(1000 + position)})
    records[0][2] = -2

    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]

    observations = numpy.array(records)
    return (observations, classes.astype(numpy.int64), initial_types,
            'transform', 0, observations)

525

526

def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.
    The first element of every item of *text_alpha_num* is the input,
    the second one is the target converted into *dtype*.
    Parameter *n_features* is not used.
    """
    texts = numpy.array([item[0] for item in text_alpha_num])
    targets = numpy.array([item[1] for item in text_alpha_num], dtype=dtype)
    initial_types = [("X", StringTensorType([None]))]
    return (texts, targets, initial_types, 'transform', 0, texts)

535

536

def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`.
    The input is the output of a *CountVectorizer* fitted on the
    texts of *text_alpha_num*, converted into *dtype*.
    Parameter *n_features* is not used.
    """
    texts = numpy.array([item[0] for item in text_alpha_num])
    targets = numpy.array([item[1] for item in text_alpha_num], dtype=dtype)
    counts = CountVectorizer().fit_transform(texts).astype(dtype)
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType
    initial_types = [("X", tensor_type([None, counts.shape[1]]))]
    return (counts, targets, initial_types, 'transform', 0, counts)

547

548

def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:FeatureHasher`.
    Every observation is a dictionary with one string key ``cl<class>``,
    built from the class of the observation in Iris dataset,
    mapped to ``1000 + index``. The first dictionary also receives
    key ``cl2``. Parameter *n_features* is not used.
    """
    classes = load_iris().target
    records = []
    for position, label in enumerate(classes):
        records.append({("cl%d" % label): dtype(1000 + position)})
    records[0]["cl2"] = -2

    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]

    observations = numpy.array(records)
    return (observations, classes, initial_types,
            'transform', 0, observations)

562

563

def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`.
    The noisy features of Iris dataset are truncated to integers,
    converted into *dtype* and shuffled, only the first column
    is given to the model.
    """
    iris = load_iris()
    generator = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noisy = iris.data + generator.randn(*iris.data.shape) / 3
    features = _modify_dimension(noisy, n_features)
    features = features.astype(numpy.int32).astype(dtype)
    features, labels = shuffle(features, iris.target, random_state=1)
    # NOTE(review): the initial type is built from a row holding every
    # column whereas the model only receives the first column - confirm.
    initial_types = [('X', features[:1].astype(dtype))]
    first_column = features[:, :1]
    return (first_column, labels, initial_types,
            'transform', 0, first_column.astype(dtype))

577

578

def find_suitable_problem(model):
    """
    Determines problems suitable for a given
    :epkg:`scikit-learn` operator. It may be

    * `b-cl`: binary classification
    * `m-cl`: multi-class classification
    * `m-label`: classification multi-label
      (multiple labels possible at the same time)
    * `reg`: regression
    * `m-reg`: regression multi-output
    * `num-tr`: transform numerical features
    * `num-tr-pos`: transform numerical positive features
    * `scoring`: transform numerical features, target is usually needed
    * `outlier`: outlier prediction
    * `linearsvc`: classifier without *predict_proba*
    * `cluster`: similar to transform
    * `num+y-tr`: similar to transform with targets
    * `num+y-tr-cl`: similar to transform with classes
    * `num-tr-clu`: similar to cluster, but returns
      scores or distances instead of cluster
    * `key-col`: list of dictionaries
    * `text-col`: one column of text

    Suffix `nofit` indicates the predictions happens
    without the model being fitted. This is the case
    for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`.
    The suffix `-cov` indicates the method `predict` was called
    with parameter ``return_cov=True``, `-std` tells
    method `predict` was called with parameter ``return_std=True``.
    The suffix ``-NSV`` creates an input variable
    like the following ``[('X', FloatTensorType([None, None]))]``.
    That's a way to bypass :epkg:`onnxruntime` shape checking
    as one part of the graph is designed to handle any
    kind of dimensions but apparently, if the input shape is
    precise, every part of the graph has to be precise. The strings
    used variables which means it is at the same time precise
    and unprecise. Suffix ``'-64'`` means the model will
    do double computations. Suffix ``-nop`` means the classifier
    does not implement method *predict_proba*. Suffix ``-1d``
    means a one dimension problem (one feature). Suffix ``-dec``
    checks method `decision_function`.

    The following script gives the list of :epkg:`scikit-learn`
    models and the problem they can be fitted on.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning
        :rst:

        from mlprodict.onnxrt.validate.validate import (
            sklearn_operators, find_suitable_problem)
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        res = sklearn_operators()
        rows = []
        for model in res[:20]:
            name = model['name']
            row = dict(name=name)
            try:
                prob = find_suitable_problem(model['cl'])
                if prob is None:
                    continue
                for p in prob:
                    row[p] = 'X'
            except RuntimeError:
                pass
            rows.append(row)
        df = DataFrame(rows).set_index('name')
        df = df.sort_index()
        print(df2rst(df, index=True))

    The list is truncated. The full list can be found at
    :ref:`l-model-problem-list`.

    The function raises *RuntimeError* if no problem is found for
    the model and *ValueError* if one of the selected problems
    is not registered in `_problems`.
    """
    from ...onnx_conv.validate_scenarios import (
        find_suitable_problem as ext_find_suitable_problem)

    def _internal(model):  # pylint: disable=R0911

        # checks that this model is not overwritten by this module
        overridden = ext_find_suitable_problem(model)
        if overridden is not None:
            return overridden

        # Explicit scenarios, scanned in this order, the first group
        # containing the model wins. The table is rebuilt at each call
        # so that the caller always receives a new list.
        scenarios = [
            # Exceptions
            # m-reg causes MemoryError on some machine.
            ({GaussianProcessRegressor},
             ['~b-reg-NF-64',  # '~m-reg-NF-64',
              '~b-reg-NF-cov-64',  # '~m-reg-NF-cov-64',
              '~b-reg-NF-std-64',  # '~m-reg-NF-std-64',
              '~b-reg-NSV-64',  # '~m-reg-NSV-64',
              '~b-reg-cov-64',  # '~m-reg-cov-64',
              '~b-reg-std-NSV-64',  # '~m-reg-std-NSV-64',
              'b-reg', '~b-reg-64',  # 'm-reg'
              ]),
            ({DictVectorizer}, ['key-int-col']),
            ({TfidfVectorizer, CountVectorizer}, ['text-col']),
            ({TfidfTransformer}, ['bow']),
            ({FeatureHasher}, ['key-str-col']),
            ({OneHotEncoder}, ['one-hot']),
            ({LabelBinarizer, LabelEncoder}, ['int-col']),
            ({NuSVC, SVC, SGDClassifier,
              HistGradientBoostingClassifier},
             ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan']),
            ({GaussianProcessClassifier}, ['b-cl', 'm-cl', '~b-cl-64']),
            ({BaggingClassifier, BernoulliNB, CalibratedClassifierCV,
              ComplementNB, GaussianNB,
              GradientBoostingClassifier, LabelPropagation, LabelSpreading,
              LinearDiscriminantAnalysis, LogisticRegressionCV,
              MultinomialNB, QuadraticDiscriminantAnalysis,
              RandomizedSearchCV},
             ['b-cl', 'm-cl']),
            ({Perceptron},
             ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec']),
            ({AdaBoostRegressor}, ['b-reg', '~b-reg-64']),
            ({HistGradientBoostingRegressor},
             ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64']),
            ({LinearSVC, NearestCentroid}, ['~b-cl-nop', '~b-cl-nop-64']),
            ({RFE, RFECV}, ['num+y-tr']),
            ({GridSearchCV},
             ['b-cl', 'm-cl',
              'b-reg', 'm-reg',
              '~b-reg-64', '~b-cl-64',
              'cluster', 'outlier', '~m-label']),
            ({VotingClassifier}, ['b-cl', 'm-cl']),
            # Stacking models are None if scikit-learn does not have them,
            # the group is then empty and never matches.
            ({StackingClassifier} - {None}, ['b-cl']),
            ({StackingRegressor} - {None}, ['b-reg']),
            # specific scenarios
            ({IsotonicRegression}, ['~num+y-tr-1d', '~b-reg-1d']),
            ({ARDRegression, BayesianRidge, ElasticNetCV,
              GradientBoostingRegressor,
              LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
              LinearSVR, NuSVR, OrthogonalMatchingPursuitCV,
              PassiveAggressiveRegressor, SGDRegressor,
              TheilSenRegressor, HuberRegressor, SVR},
             ['b-reg', '~b-reg-64']),
            ({MultiOutputClassifier}, ['m-cl', '~m-label']),
            ({MultiOutputRegressor, MultiTaskElasticNet,
              MultiTaskElasticNetCV, MultiTaskLassoCV,
              MultiTaskLasso},
             ['m-reg']),
            ({OneVsOneClassifier, OutputCodeClassifier,
              PassiveAggressiveClassifier, RadiusNeighborsClassifier},
             ['~b-cl-nop', '~m-cl-nop']),
            ({RidgeClassifier, RidgeClassifierCV},
             ['~b-cl-nop', '~m-cl-nop', '~m-label']),
            # trainable transform
            ({GenericUnivariateSelect,
              NeighborhoodComponentsAnalysis,
              PLSSVD, SelectKBest,
              SelectPercentile, SelectFromModel},
             ["num+y-tr"]),
            ({SelectFwe, SelectFdr, SelectFpr}, ["num+y-tr-cl"]),
            # no m-label
            ({AdaBoostClassifier}, ['b-cl', '~b-cl-64', 'm-cl']),
            ({LogisticRegression},
             ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']),
            ({RandomForestClassifier},
             ['b-cl', '~b-cl-64', 'm-cl', '~m-label']),
            ({DecisionTreeClassifier, ExtraTreeClassifier},
             ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label']),
            ({DecisionTreeRegressor},
             ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100']),
            ({LatentDirichletAllocation, NMF, PowerTransformer},
             ['num-tr-pos']),
        ]
        for group, problems in scenarios:
            if model in group:
                return problems

        # Predictors recognized from their name.
        if hasattr(model, 'predict'):
            label = str(model)
            if "Classifier" in label:
                return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
            if "Regressor" in label:
                return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']

        # Generic case.
        found = []
        if hasattr(model, 'transform'):
            if issubclass(model, (RegressorMixin, ClassifierMixin)):
                found.append('num+y-tr')
            elif issubclass(model, (ClusterMixin, BiclusterMixin)):
                found += ['~num-tr-clu', '~num-tr-clu-64']
            else:
                found.append('num-tr')

        if (hasattr(model, 'predict') and
                issubclass(model, (ClusterMixin, BiclusterMixin))):
            found += ['cluster', '~b-clu-64']

        if issubclass(model, (OutlierMixin)):
            found.append('outlier')

        if issubclass(model, ClassifierMixin):
            # NOTE(review): OneVsRestClassifier looks already caught by the
            # test on the name "Classifier" above, this branch is probably
            # never reached - confirm which list is the expected one.
            if model is OneVsRestClassifier:
                return ['m-cl', '~m-label']
            found += ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
        if issubclass(model, RegressorMixin):
            found += ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']
        if issubclass(model, BaseMixture):
            found += ['mix', '~mix-64']

        if found:
            return found

        raise RuntimeError("Unable to find problem for model '{}' - {}."
                           "".format(model.__name__, model.__bases__))

    selected = _internal(model)
    # every selected problem must be registered in _problems
    for problem in selected:
        if problem not in _problems:
            raise ValueError(  # pragma: no cover
                "Unrecognized problem '{}' in\n{}".format(
                    problem, "\n".join(sorted(_problems))))
    return selected

839

840

841_problems = {

842 # standard

843 "b-cl": _problem_for_predictor_binary_classification,

844 "m-cl": _problem_for_predictor_multi_classification,

845 "b-reg": _problem_for_predictor_regression,

846 "m-reg": _problem_for_predictor_multi_regression,

847 "num-tr": _problem_for_numerical_transform,

848 "num-tr-pos": _problem_for_numerical_transform_positive,

849 'outlier': _problem_for_outlier,

850 'cluster': _problem_for_clustering,

851 'num+y-tr': _problem_for_numerical_trainable_transform,

852 'num+y-tr-cl': _problem_for_numerical_trainable_transform_cl,

853 'mix': _problem_for_mixture,

854 # others

855 '~num-tr-clu': _problem_for_clustering_scores,

856 "~m-label": _problem_for_predictor_multi_classification_label,

857 "~scoring": _problem_for_numerical_scoring,

858 '~b-cl-nop': _problem_for_clnoproba_binary,

859 '~m-cl-nop': _problem_for_clnoproba,

860 '~b-cl-dec': _problem_for_cl_decision_function_binary,

861 '~m-cl-dec': _problem_for_cl_decision_function,

862 # nan

863 "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression(

864 n_features=n_features, add_nan=True),

865 "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression(

866 dtype=numpy.float64, n_features=n_features, add_nan=True),

867 "~b-cl-nan": lambda dtype=numpy.float32, n_features=None: _problem_for_predictor_binary_classification(

868 dtype=dtype, n_features=n_features, add_nan=True),

869 # 100 features

870 "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression(

871 n_features=n_features or 100),

872 "~b-cl-f100": lambda n_features=100: _problem_for_predictor_binary_classification(

873 n_features=n_features or 100),

874 # 64

875 "~b-cl-64": lambda n_features=None: _problem_for_predictor_binary_classification(

876 dtype=numpy.float64, n_features=n_features),

877 "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression(

878 dtype=numpy.float64, n_features=n_features),

879 '~b-cl-nop-64': lambda n_features=None: _problem_for_clnoproba(

880 dtype=numpy.float64, n_features=n_features),

881 '~b-clu-64': lambda n_features=None: _problem_for_clustering(

882 dtype=numpy.float64, n_features=n_features),

883 '~b-cl-dec-64': lambda n_features=None: _problem_for_cl_decision_function_binary(

884 dtype=numpy.float64, n_features=n_features),

885 '~num-tr-clu-64': lambda n_features=None: _problem_for_clustering_scores(

886 dtype=numpy.float64, n_features=n_features),

887 "~m-reg-64": lambda n_features=None: _problem_for_predictor_multi_regression(

888 dtype=numpy.float64, n_features=n_features),

889 "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform(

890 dtype=numpy.float64, n_features=n_features),

891 '~mix-64': lambda n_features=None: _problem_for_mixture(

892 dtype=numpy.float64, n_features=n_features),

893 #

894 "~b-cl-NF": (lambda n_features=None: _problem_for_predictor_binary_classification(

895 n_features=n_features) + (False, )),

896 "~m-cl-NF": (lambda n_features=None: _problem_for_predictor_multi_classification(

897 n_features=n_features) + (False, )),

898 "~b-reg-NF": (lambda n_features=None: _problem_for_predictor_regression(

899 n_features=n_features) + (False, )),

900 "~m-reg-NF": (lambda n_features=None: _problem_for_predictor_multi_regression(

901 n_features=n_features) + (False, )),

902 #

903 "~b-cl-NF-64": (lambda n_features=None: _problem_for_predictor_binary_classification(

904 dtype=numpy.float64, n_features=n_features) + (False, )),

905 "~m-cl-NF-64": (lambda n_features=None: _problem_for_predictor_multi_classification(

906 dtype=numpy.float64, n_features=n_features) + (False, )),

907 "~b-reg-NF-64": (lambda n_features=None: _problem_for_predictor_regression(

908 dtype=numpy.float64, n_features=n_features) + (False, )),

909 "~m-reg-NF-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

910 dtype=numpy.float64, n_features=n_features) + (False, )),

911 # GaussianProcess

912 "~b-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_regression(

913 True, options={GaussianProcessRegressor: {"return_cov": True}},

914 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),

915 "~m-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

916 True, options={GaussianProcessRegressor: {"return_cov": True}},

917 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),

918 #

919 "~b-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_regression(

920 True, options={GaussianProcessRegressor: {"return_std": True}},

921 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),

922 "~m-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

923 True, options={GaussianProcessRegressor: {"return_std": True}},

924 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),

925 #

926 "~b-reg-cov-64": (lambda n_features=None: _problem_for_predictor_regression(

927 True, options={GaussianProcessRegressor: {"return_cov": True}},

928 return_cov=True, dtype=numpy.float64, n_features=n_features)),

929 "~m-reg-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

930 True, options={GaussianProcessRegressor: {"return_cov": True}},

931 return_cov=True, dtype=numpy.float64, n_features=n_features)),

932 #

933 "~reg-std-64": (lambda n_features=None: _problem_for_predictor_regression(

934 True, options={GaussianProcessRegressor: {"return_std": True}},

935 return_std=True, dtype=numpy.float64, n_features=n_features)),

936 "~m-reg-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

937 True, options={GaussianProcessRegressor: {"return_std": True}},

938 return_std=True, dtype=numpy.float64, n_features=n_features)),

939 #

940 '~b-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_regression(

941 dtype=numpy.float64, n_features=n_features)),

942 '~m-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(

943 dtype=numpy.float64, n_features=n_features)),

944 "~b-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_regression(

945 True, options={GaussianProcessRegressor: {"return_std": True}},

946 return_std=True, dtype=numpy.float64, n_features=n_features))),

947 "~m-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(

948 True, options={GaussianProcessRegressor: {"return_std": True}},

949 return_std=True, dtype=numpy.float64, n_features=n_features))),

950 # isotonic

951 "~b-reg-1d": _1d_problem(_problem_for_predictor_regression),

952 '~num+y-tr-1d': _1d_problem(_problem_for_numerical_trainable_transform),

953 # text

954 "key-int-col": _problem_for_dict_vectorizer,

955 "key-str-col": _problem_for_feature_hasher,

956 "int-col": _problem_for_label_encoder,

957 "one-hot": _problem_for_one_hot_encoder,

958 'text-col': _problem_for_tfidf_vectorizer,

959 'bow': _problem_for_tfidf_transformer,

960}

Coverage for mlprodict/onnxrt/validate/validate_problems.py : 99%

441 statements

Coverage for mlprodict/onnxrt/validate/validate_problems.py : 99%

441 statements 436 run 5 missing 5 excluded

441 statements