Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

@file
@brief Validates runtime for many :epkg:`scikit-learn` operators.
The submodule relies on :epkg:`onnxconverter_common` and
:epkg:`sklearn-onnx`.

6""" 

7import numpy 

8from sklearn.base import ( 

9 ClusterMixin, BiclusterMixin, OutlierMixin, 

10 RegressorMixin, ClassifierMixin) 

11from sklearn.calibration import CalibratedClassifierCV 

12from sklearn.cross_decomposition import PLSSVD 

13from sklearn.datasets import load_iris 

14from sklearn.decomposition import LatentDirichletAllocation, NMF 

15from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 

16from sklearn.ensemble import ( 

17 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier, 

18 BaggingClassifier, VotingClassifier, GradientBoostingClassifier, 

19 RandomForestClassifier) 

20try: 

21 from sklearn.ensemble import StackingClassifier, StackingRegressor 

22except ImportError: # pragma: no cover 

23 # new in 0.22 

24 StackingClassifier, StackingRegressor = None, None 

25from sklearn.feature_extraction import DictVectorizer, FeatureHasher 

26from sklearn.feature_extraction.text import ( 

27 CountVectorizer, TfidfVectorizer, TfidfTransformer) 

28from sklearn.experimental import enable_hist_gradient_boosting # pylint: disable=W0611 

29from sklearn.ensemble import ( 

30 HistGradientBoostingRegressor, 

31 HistGradientBoostingClassifier) 

32from sklearn.feature_selection import ( 

33 RFE, RFECV, GenericUnivariateSelect, 

34 SelectPercentile, SelectFwe, SelectKBest, 

35 SelectFdr, SelectFpr, SelectFromModel) 

36from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor 

37from sklearn.isotonic import IsotonicRegression 

38from sklearn.linear_model import ( 

39 ARDRegression, ElasticNetCV, 

40 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC, 

41 SGDRegressor, OrthogonalMatchingPursuitCV, 

42 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet, 

43 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso, 

44 PassiveAggressiveClassifier, RidgeClassifier, 

45 RidgeClassifierCV, PassiveAggressiveRegressor, 

46 HuberRegressor, LogisticRegression, SGDClassifier, 

47 LogisticRegressionCV, Perceptron) 

48from sklearn.mixture._base import BaseMixture 

49from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

50from sklearn.multiclass import ( 

51 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier) 

52from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier 

53from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB 

54from sklearn.neighbors import ( 

55 NearestCentroid, RadiusNeighborsClassifier, 

56 NeighborhoodComponentsAnalysis) 

57from sklearn.preprocessing import ( 

58 LabelBinarizer, LabelEncoder, 

59 OneHotEncoder, PowerTransformer) 

60from sklearn.semi_supervised import LabelPropagation, LabelSpreading 

61from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC 

62from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier 

63from sklearn.utils import shuffle 

64from skl2onnx.common.data_types import ( 

65 FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType) 

66from ._validate_problems_helper import ( 

67 _noshapevar, _1d_problem, text_alpha_num) 

68 

69 

def _modify_dimension(X, n_features, seed=19):
    """
    Changes the number of columns of a features matrix.
    Columns are dropped when *n_features* is smaller than the
    current number of features. Otherwise new columns are derived
    from the existing ones with some random noise.

    @param      X           features matrix
    @param      n_features  expected number of features,
                            None to leave *X* unchanged
    @param      seed        random seed (to get the same
                            dataset at each call)
    @return                 new features matrix, *X* itself if
                            the dimension is already the expected one

    The function raises *NotImplementedError* if columns must be
    added to a matrix which is not float32, float64, int32 or int64.
    """
    if n_features is None:
        return X
    n_cols = X.shape[1]
    if n_features == n_cols:
        return X
    if n_features < n_cols:
        return X[:, :n_features]

    rng = numpy.random.RandomState(seed)  # pylint: disable=E1101
    n_rows = X.shape[0]
    out = numpy.empty((n_rows, n_features), dtype=X.dtype)
    out[:, :n_cols] = X
    shrink = max(n_features // n_cols + 1, 2)
    is_float = X.dtype in (numpy.float32, numpy.float64)
    is_int = X.dtype in (numpy.int32, numpy.int64)

    for new_idx in range(n_cols, n_features):
        # every new column is derived from an original one
        src_idx = new_idx % n_cols
        source = X[:, src_idx]
        if is_float:
            # gaussian noise scaled by the column standard deviation
            noise = rng.randn(len(source)) * (numpy.var(source) ** 0.5) / shrink
            noisy = source + noise
            # the original column is altered as well
            out[:, src_idx] -= noisy / shrink
            out[:, new_idx] = noisy
        elif is_int:
            # one value out of *shrink* is replaced by a permuted one
            shuffled = rng.permutation(source)
            start = rng.randint(0, shrink) % n_rows
            mixed = source.copy()
            mixed[start::shrink] = shuffled[start::shrink]  # pylint: disable=E1136
            out[:, new_idx] = mixed
            # one value of the original column is altered as well
            start = (start + 1) % n_rows
            out[start, src_idx] = shuffled[start]  # pylint: disable=E1136
        else:  # pragma: no cover
            raise NotImplementedError(  # pragma: no cover
                "Unable to add noise to a feature for this type {}".format(X.dtype))
    return out

110 

111 

112########### 

113# datasets 

114########### 

115 

116 

def _problem_for_predictor_binary_classification(
        dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    binary classification problem.
    It is based on Iris dataset.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @param      add_nan     if True, some values of *X* are
                            replaced by nan
    @return                 tuple *(X, y, initial_types,
                            'predict_proba', 1, X runtime)*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    # classes 1 and 2 are merged to get a binary problem
    y[y == 2] = 1
    if add_nan:
        # The seeded generator keeps the dataset identical at every call.
        # The upper bound of randint is exclusive: the full dimension
        # is given so that the last row and the last column may receive
        # a nan and a single feature does not raise an exception.
        n_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_nan)
        cols = state.randint(0, X.shape[1], n_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))

140 

141 

def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None):
    """
    Builds a multi-class classification problem
    from the Iris dataset. Some noise drawn with a fixed
    seed is added to the features.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, y, initial_types, method,
                            node name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict_proba', 1, features.astype(dtype))

159 

160 

def _problem_for_mixture(dtype=numpy.float32, n_features=None):
    """
    Builds a problem for mixture models from the Iris dataset.
    Some noise drawn with a fixed seed is added to the features.
    No target is returned.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, None, initial_types, method,
                            node name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict_proba', 1, features.astype(dtype))

178 

179 

def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None):
    """
    Builds a multi-label classification problem
    from the Iris dataset. The target is a matrix with three
    columns, one per class. Every observation gets its own class
    and one observation out of five gets the following class as well.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, y, initial_types, method,
                            node name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)

    classes = iris.target
    n_obs = classes.shape[0]
    indicator = numpy.zeros((n_obs, 3), dtype=numpy.int64)
    # one-hot encoding of the class
    indicator[numpy.arange(n_obs), classes] = 1
    # a second label for one observation out of five
    extra = numpy.arange(0, n_obs, 5)
    indicator[extra, (classes[extra] + 1) % 3] = 1

    features = features.astype(dtype)
    indicator = indicator.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, indicator, initial_types,
            'predict_proba', 1, features.astype(dtype))

202 

203 

def _problem_for_predictor_regression(many_output=False, options=None,
                                      n_features=None, nbrows=None,
                                      dtype=numpy.float32, add_nan=False,
                                      **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    regression problem.
    It is based on Iris dataset.

    @param      many_output if True, the returned name is ``'all'``,
                            otherwise 0
    @param      options     if not None, the initial types are returned
                            as a tuple *(initial_types, options)*
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @param      nbrows      if not None, only the first *nbrows*
                            observations are kept
    @param      dtype       type *X* and *y* are converted into
    @param      add_nan     if True, some values of *X* are
                            replaced by nan
    @param      kwargs      additional parameters returned
                            with the method name
    @return                 tuple *(X, y, initial_types,
                            ('predict', kwargs), name, X runtime)*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    # **kwargs is always a dictionary (never None), the method
    # is therefore always returned as a tuple
    meth = ('predict', kwargs)
    if n_features is not None:
        X = X[:, :n_features]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
    # initial types are computed before any nan is inserted
    itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    if add_nan:
        # The seeded generator keeps the dataset identical at every call.
        # The upper bound of randint is exclusive: the full dimension
        # is given so that the last row and the last column may receive
        # a nan and a single feature does not raise an exception.
        n_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_nan)
        cols = state.randint(0, X.shape[1], n_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))

239 

240 

def _problem_for_predictor_multi_regression(many_output=False, options=None,
                                            n_features=None, nbrows=None,
                                            dtype=numpy.float32, **kwargs):
    """
    Builds a regression problem with two targets
    from the Iris dataset. The second target is the first one
    shifted by 0.5.

    @param      many_output if True, the returned name is ``'all'``,
                            otherwise 0
    @param      options     if not None, the initial types are returned
                            as a tuple *(initial_types, options)*
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @param      nbrows      if not None, only the first *nbrows*
                            observations are kept
    @param      dtype       type *X* and *y* are converted into
    @param      kwargs      additional parameters returned
                            with the method name
    @return                 *X, y, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    target = iris.target.astype(float) + numpy.arange(len(iris.target)) / 100

    if n_features is not None:
        features = features[:, :n_features]
    if nbrows is not None:
        features = features[:nbrows, :]
        target = target[:nbrows]

    initial_types = [('X', features[:1].astype(dtype))]
    if options is not None:
        initial_types = initial_types, options

    targets = numpy.column_stack([target, target + 0.5]).astype(dtype)
    features = features.astype(dtype)
    # **kwargs is always a dictionary, the method is always a tuple
    method = ('predict', kwargs)
    name = 'all' if many_output else 0
    return (features, targets, initial_types,
            method, name, features.astype(dtype))

274 

275 

def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None):
    """
    Builds a transformation problem from the Iris dataset.
    Some noise drawn with a fixed seed is added to the features.
    No target is returned.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, None, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # NOTE(review): the runtime matrix is always float32 whatever
    # *dtype* is, confirm this is intended.
    runtime = features.astype(dtype=numpy.float32)
    return (features, None, initial_types, 'transform', 0, runtime)

291 

292 

def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None):
    """
    Builds a transformation problem with nonnegative features
    from the Iris dataset: the absolute value of the noisy
    features is taken. No target is returned.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, None, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(numpy.abs(iris.data + noise), n_features)
    features = features.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # NOTE(review): the runtime matrix is always float32 whatever
    # *dtype* is, confirm this is intended.
    runtime = features.astype(dtype=numpy.float32)
    return (features, None, initial_types, 'transform', 0, runtime)

307 

308 

def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None):
    """
    Builds a transformation problem with a continuous target
    from the Iris dataset.

    @param      dtype       type *X* and *y* are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, y, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    # the class is made continuous with a small increasing shift
    target = iris.target + numpy.arange(len(iris.target)) / 100
    target = target.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, target, initial_types,
            'transform', 0, features.astype(dtype))

326 

327 

def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None):
    """
    Builds a transformation problem with classes as a target
    from the Iris dataset.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, y, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'transform', 0, features.astype(dtype))

345 

346 

def _problem_for_clustering(dtype=numpy.float32, n_features=None):
    """
    Builds a clustering problem from the Iris dataset.
    The method to validate is *predict*. No target is returned.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, None, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict', 0, features.astype(dtype))

362 

363 

def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
    """
    Builds a clustering problem from the Iris dataset,
    the score part, not the cluster: the method to validate
    is *transform*. No target is returned.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, None, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'transform', 1, features.astype(dtype))

379 

380 

def _problem_for_outlier(dtype=numpy.float32, n_features=None):
    """
    Builds an outlier detection problem from the Iris dataset.
    The method to validate is *predict*. No target is returned.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, None, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict', 0, features.astype(dtype))

396 

397 

def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem.
    It is based on Iris dataset.

    @param      dtype       type *X* and *y* are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 tuple *(X, y, initial_types,
                            'score', 0, X runtime)*, *y* is scaled
                            so that its maximum is 1
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    # n_features is honoured the same way the other numerical
    # problems of this module do instead of being silently ignored
    X = _modify_dimension(X, n_features)
    y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100
    y /= numpy.max(y)
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'score', 0, X.astype(dtype))

415 

416 

def _problem_for_clnoproba(dtype=numpy.float32, n_features=None):
    """
    Builds a multi-class classification problem from the Iris
    dataset for which the method to validate is *predict*.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, y, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict', 0, features.astype(dtype))

434 

435 

def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem. Binary classification.
    It is based on Iris dataset.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @param      add_nan     if True, some values of *X* are
                            replaced by nan
    @return                 tuple *(X, y, initial_types,
                            'predict', 0, X runtime)*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    # classes 1 and 2 are merged to get a binary problem
    y[y == 2] = 1
    if add_nan:
        # The seeded generator keeps the dataset identical at every call.
        # The upper bound of randint is exclusive: the full dimension
        # is given so that the last row and the last column may receive
        # a nan and a single feature does not raise an exception.
        n_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], n_nan)
        cols = state.randint(0, X.shape[1], n_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))

458 

459 

def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None):
    """
    Builds a multi-class classification problem from the Iris
    dataset for which the method to validate is *decision_function*.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, y, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))

477 

478 

def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None):
    """
    Builds a binary classification problem from the Iris
    dataset for which the method to validate is *decision_function*.
    Classes 1 and 2 are merged.

    @param      dtype       type the features are converted into
    @param      n_features  number of features, None to keep
                            the number of features of the dataset
    @return                 *X, y, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features).astype(dtype)
    labels = iris.target
    labels[labels == 2] = 1
    labels = labels.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))

497 

498 

def _problem_for_label_encoder(dtype=numpy.int64, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`.
    The input is the target of the Iris dataset.

    @param      dtype       type the labels are converted into
    @param      n_features  unused
    @return                 *X, None, initial_types, method,
                            name, X runtime*, *X* and *X runtime*
                            are the same array
    """
    labels = load_iris().target.astype(dtype)
    initial_types = [('X', labels[:1].astype(dtype))]
    return (labels, None, initial_types, 'transform', 0, labels)

509 

510 

def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`.
    Every observation is a dictionary whose key is the class
    of the Iris dataset and whose value is ``1000 + index``.

    @param      dtype       type of the values
    @param      n_features  unused
    @return                 *X, y, initial_types, method,
                            name, X runtime*, *X* and *X runtime*
                            are the same array of dictionaries
    """
    classes = load_iris().target
    rows = [{label: dtype(1000 + index)}
            for index, label in enumerate(classes)]
    # the first observation gets a second key
    rows[0][2] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    observations = numpy.array(rows)
    return (observations, classes.astype(numpy.int64), initial_types,
            'transform', 0, observations)

525 

526 

def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.
    The texts and the targets come from *text_alpha_num*.

    @param      dtype       type of the target
    @param      n_features  unused
    @return                 *X, y, initial_types, method,
                            name, X runtime*, *X* and *X runtime*
                            are the same array
    """
    texts = numpy.array([row[0] for row in text_alpha_num])
    target = numpy.array([row[1] for row in text_alpha_num], dtype=dtype)
    initial_types = [("X", StringTensorType([None]))]
    return (texts, target, initial_types, 'transform', 0, texts)

535 

536 

def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`.
    The features are the counts produced by a *CountVectorizer*
    fitted on the texts of *text_alpha_num*.

    @param      dtype       type of the counts and the target
    @param      n_features  unused
    @return                 *X, y, initial_types, method,
                            name, X runtime*, *X* and *X runtime*
                            are the same matrix
    """
    texts = numpy.array([row[0] for row in text_alpha_num])
    target = numpy.array([row[1] for row in text_alpha_num], dtype=dtype)
    counts = CountVectorizer().fit_transform(texts).astype(dtype)
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType
    initial_types = [("X", tensor_type([None, counts.shape[1]]))]
    return (counts, target, initial_types, 'transform', 0, counts)

547 

548 

def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:FeatureHasher`.
    Every observation is a dictionary whose key is ``cl<class>``,
    the class coming from the Iris dataset, and whose value
    is ``1000 + index``.

    @param      dtype       type of the values
    @param      n_features  unused
    @return                 *X, y, initial_types, method,
                            name, X runtime*, *X* and *X runtime*
                            are the same array of dictionaries
    """
    classes = load_iris().target
    rows = [{("cl%d" % label): dtype(1000 + index)}
            for index, label in enumerate(classes)]
    # the first observation gets a second key
    rows[0]["cl2"] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    observations = numpy.array(rows)
    return (observations, classes, initial_types,
            'transform', 0, observations)

562 

563 

def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`.
    The noisy features of the Iris dataset are truncated into integers,
    shuffled, and only the first column is returned.

    @param      dtype       type the features are converted into
    @param      n_features  number of features before the first
                            column is selected, None to keep
                            the number of features of the dataset
    @return                 *X, y, initial_types, method,
                            name, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noisy = iris.data + rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(noisy, n_features)
    # values are truncated into integers to get categories
    features = features.astype(numpy.int32).astype(dtype)
    features, labels = shuffle(features, iris.target, random_state=1)
    # NOTE(review): the initial types are built from all the columns
    # while only the first one is returned, confirm this is intended.
    initial_types = [('X', features[:1].astype(dtype))]
    first_column = features[:, :1]
    return (first_column, labels, initial_types,
            'transform', 0, first_column.astype(dtype))

577 

578 

def find_suitable_problem(model):
    """
    Determines problems suitable for a given
    :epkg:`scikit-learn` operator. Among the possible
    answers:

    * `b-cl`: binary classification
    * `m-cl`: multi-class classification
    * `~m-label`: classification multi-label
        (multiple labels possible at the same time)
    * `b-reg`: regression
    * `m-reg`: regression multi-output
    * `num-tr`: transform numerical features
    * `num-tr-pos`: transform numerical positive features
    * `~scoring`: transform numerical features, target is usually needed
    * `outlier`: outlier prediction
    * `cluster`: similar to transform
    * `num+y-tr`: similar to transform with targets
    * `num+y-tr-cl`: similar to transform with classes
    * `~num-tr-clu`: similar to cluster, but returns
        scores or distances instead of cluster
    * `mix`: mixture models
    * `key-int-col`, `key-str-col`: list of dictionaries
    * `text-col`: one column of text

    Suffix ``-NF`` (*nofit*) indicates the predictions happens
    without the model being fitted. This is the case
    for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`.
    The suffix `-cov` indicates the method `predict` was called
    with parameter ``return_cov=True``, `-std` tells
    method `predict` was called with parameter ``return_std=True``.
    The suffix ``-NSV`` creates an input variable
    like the following ``[('X', FloatTensorType([None, None]))]``.
    That's a way to bypass :epkg:`onnxruntime` shape checking
    as one part of the graph is designed to handle any
    kind of dimensions but apparently, if the input shape is
    precise, every part of the graph has to be precise. The strings
    used variables which means it is at the same time precise
    and unprecise. Suffix ``'-64'`` means the model will
    do double computations. Suffix ``-nop`` means the classifier
    does not implement method *predict_proba*. Suffix ``-1d``
    means a one dimension problem (one feature). Suffix ``-dec``
    checks method `decision_function`. Suffix ``-nan`` means
    some features are replaced by nan, suffix ``-f100`` means
    the problem has 100 features.

    The following script gives the list of :epkg:`scikit-learn`
    models and the problem they can be fitted on.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning
        :rst:

        from mlprodict.onnxrt.validate.validate import (
            sklearn_operators, find_suitable_problem)
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        res = sklearn_operators()
        rows = []
        for model in res[:20]:
            name = model['name']
            row = dict(name=name)
            try:
                prob = find_suitable_problem(model['cl'])
                if prob is None:
                    continue
                for p in prob:
                    row[p] = 'X'
            except RuntimeError:
                pass
            rows.append(row)
        df = DataFrame(rows).set_index('name')
        df = df.sort_index()
        print(df2rst(df, index=True))

    The list is truncated. The full list can be found at
    :ref:`l-model-problem-list`.

    @param      model       :epkg:`scikit-learn` operator (a class)
    @return                 list of problem names

    The function raises *RuntimeError* if no problem can be
    associated to the model and *ValueError* if one of the selected
    problems is not registered in *_problems*.
    """
    from ...onnx_conv.validate_scenarios import (
        find_suitable_problem as ext_find_suitable_problem)

    def _internal(model):  # pylint: disable=R0911

        # checks that this model is not overwritten by this module
        overwritten = ext_find_suitable_problem(model)
        if overwritten is not None:
            return overwritten

        # Explicit cases, checked in this order.
        # StackingClassifier, StackingRegressor are None
        # if scikit-learn does not implement them.
        stacking_cl = (
            {StackingClassifier} if StackingClassifier is not None else set())
        stacking_reg = (
            {StackingRegressor} if StackingRegressor is not None else set())

        explicit = [
            # Exceptions
            # m-reg causes MemoryError on some machine.
            ({GaussianProcessRegressor},
             ['~b-reg-NF-64',  # '~m-reg-NF-64',
              '~b-reg-NF-cov-64',  # '~m-reg-NF-cov-64',
              '~b-reg-NF-std-64',  # '~m-reg-NF-std-64',
              '~b-reg-NSV-64',  # '~m-reg-NSV-64',
              '~b-reg-cov-64',  # '~m-reg-cov-64',
              '~b-reg-std-NSV-64',  # '~m-reg-std-NSV-64',
              'b-reg', '~b-reg-64',  # 'm-reg'
              ]),
            ({DictVectorizer}, ['key-int-col']),
            ({TfidfVectorizer, CountVectorizer}, ['text-col']),
            ({TfidfTransformer}, ['bow']),
            ({FeatureHasher}, ['key-str-col']),
            ({OneHotEncoder}, ['one-hot']),
            ({LabelBinarizer, LabelEncoder}, ['int-col']),
            ({NuSVC, SVC, SGDClassifier,
              HistGradientBoostingClassifier},
             ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan']),
            ({GaussianProcessClassifier},
             ['b-cl', 'm-cl', '~b-cl-64']),
            ({BaggingClassifier, BernoulliNB, CalibratedClassifierCV,
              ComplementNB, GaussianNB,
              GradientBoostingClassifier, LabelPropagation, LabelSpreading,
              LinearDiscriminantAnalysis, LogisticRegressionCV,
              MultinomialNB, QuadraticDiscriminantAnalysis,
              RandomizedSearchCV},
             ['b-cl', 'm-cl']),
            ({Perceptron},
             ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec']),
            ({AdaBoostRegressor}, ['b-reg', '~b-reg-64']),
            ({HistGradientBoostingRegressor},
             ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64']),
            ({LinearSVC, NearestCentroid},
             ['~b-cl-nop', '~b-cl-nop-64']),
            ({RFE, RFECV}, ['num+y-tr']),
            ({GridSearchCV},
             ['b-cl', 'm-cl',
              'b-reg', 'm-reg',
              '~b-reg-64', '~b-cl-64',
              'cluster', 'outlier', '~m-label']),
            ({VotingClassifier}, ['b-cl', 'm-cl']),
            (stacking_cl, ['b-cl']),
            (stacking_reg, ['b-reg']),
            # specific scenarios
            ({IsotonicRegression}, ['~num+y-tr-1d', '~b-reg-1d']),
            ({ARDRegression, BayesianRidge, ElasticNetCV,
              GradientBoostingRegressor,
              LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
              LinearSVR, NuSVR, OrthogonalMatchingPursuitCV,
              PassiveAggressiveRegressor, SGDRegressor,
              TheilSenRegressor, HuberRegressor, SVR},
             ['b-reg', '~b-reg-64']),
            ({MultiOutputClassifier}, ['m-cl', '~m-label']),
            ({MultiOutputRegressor, MultiTaskElasticNet,
              MultiTaskElasticNetCV, MultiTaskLassoCV,
              MultiTaskLasso},
             ['m-reg']),
            ({OneVsOneClassifier, OutputCodeClassifier,
              PassiveAggressiveClassifier, RadiusNeighborsClassifier},
             ['~b-cl-nop', '~m-cl-nop']),
            ({RidgeClassifier, RidgeClassifierCV},
             ['~b-cl-nop', '~m-cl-nop', '~m-label']),
            # trainable transform
            ({GenericUnivariateSelect,
              NeighborhoodComponentsAnalysis,
              PLSSVD, SelectKBest,
              SelectPercentile, SelectFromModel},
             ["num+y-tr"]),
            ({SelectFwe, SelectFdr, SelectFpr}, ["num+y-tr-cl"]),
            # no m-label
            ({AdaBoostClassifier}, ['b-cl', '~b-cl-64', 'm-cl']),
            ({LogisticRegression},
             ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']),
            ({RandomForestClassifier},
             ['b-cl', '~b-cl-64', 'm-cl', '~m-label']),
            ({DecisionTreeClassifier, ExtraTreeClassifier},
             ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label']),
            ({DecisionTreeRegressor},
             ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100']),
            ({LatentDirichletAllocation, NMF, PowerTransformer},
             ['num-tr-pos']),
        ]
        for operators, problems in explicit:
            if model in operators:
                return problems

        # The name of the class tells the kind of predictor.
        if hasattr(model, 'predict'):
            label = str(model)
            if "Classifier" in label:
                return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
            if "Regressor" in label:
                return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']

        # Generic case, the problems depend on the mixins
        # the model inherits from.
        found = []
        if hasattr(model, 'transform'):
            if issubclass(model, (RegressorMixin, ClassifierMixin)):
                found.append('num+y-tr')
            elif issubclass(model, (ClusterMixin, BiclusterMixin)):
                found += ['~num-tr-clu', '~num-tr-clu-64']
            else:
                found.append('num-tr')

        if hasattr(model, 'predict') and issubclass(
                model, (ClusterMixin, BiclusterMixin)):
            found += ['cluster', '~b-clu-64']

        if issubclass(model, OutlierMixin):
            found.append('outlier')

        if issubclass(model, ClassifierMixin):
            if model is OneVsRestClassifier:
                return ['m-cl', '~m-label']
            found += ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
        if issubclass(model, RegressorMixin):
            found += ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']
        if issubclass(model, BaseMixture):
            found += ['mix', '~mix-64']

        if found:
            return found

        raise RuntimeError("Unable to find problem for model '{}' - {}."
                           "".format(model.__name__, model.__bases__))

    selected = _internal(model)
    # every problem must be registered in _problems
    for name in selected:
        if name not in _problems:
            raise ValueError(  # pragma: no cover
                "Unrecognized problem '{}' in\n{}".format(
                    name, "\n".join(sorted(_problems))))
    return selected

839 

840 

# Registry of validation problems, keyed by short problem name. Every value
# is a callable: a ``_problem_for_*`` function, a lambda forwarding to one, or
# the result of wrapping one with ``_noshapevar`` / ``_1d_problem``.
# The function defined just above checks the names it returns against these
# keys and raises ValueError for an unknown name.
# Suffixes, as far as the entries below show:
#   -64    the wrapped function is called with dtype=numpy.float64
#   -nan   the wrapped function is called with add_nan=True
#   -f100  n_features defaults to 100 (None or 0 also falls back to 100)
#   -NF    ``False`` is appended to the tuple returned by the wrapped function
#          (what the consumer does with that flag is not visible here)
#   -cov / -std  called with return_cov=True / return_std=True together with
#          the matching GaussianProcessRegressor entry in ``options``
#   -NSV   the lambda is wrapped with ``_noshapevar``
#   -1d    the function is wrapped with ``_1d_problem``
841_problems = { 

842 # standard 

843 "b-cl": _problem_for_predictor_binary_classification, 

844 "m-cl": _problem_for_predictor_multi_classification, 

845 "b-reg": _problem_for_predictor_regression, 

846 "m-reg": _problem_for_predictor_multi_regression, 

847 "num-tr": _problem_for_numerical_transform, 

848 "num-tr-pos": _problem_for_numerical_transform_positive, 

849 'outlier': _problem_for_outlier, 

850 'cluster': _problem_for_clustering, 

851 'num+y-tr': _problem_for_numerical_trainable_transform, 

852 'num+y-tr-cl': _problem_for_numerical_trainable_transform_cl, 

853 'mix': _problem_for_mixture, 

854 # others 

855 '~num-tr-clu': _problem_for_clustering_scores, 

856 "~m-label": _problem_for_predictor_multi_classification_label, 

857 "~scoring": _problem_for_numerical_scoring, 

858 '~b-cl-nop': _problem_for_clnoproba_binary, 

859 '~m-cl-nop': _problem_for_clnoproba, 

860 '~b-cl-dec': _problem_for_cl_decision_function_binary, 

861 '~m-cl-dec': _problem_for_cl_decision_function, 

862 # nan 

863 "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression( 

864 n_features=n_features, add_nan=True), 

865 "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression( 

866 dtype=numpy.float64, n_features=n_features, add_nan=True), 

# Only entry whose lambda also exposes ``dtype`` (default numpy.float32).
867 "~b-cl-nan": lambda dtype=numpy.float32, n_features=None: _problem_for_predictor_binary_classification( 

868 dtype=dtype, n_features=n_features, add_nan=True), 

869 # 100 features 

# ``n_features or 100``: an explicit None (or 0) still yields 100 features.
870 "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression( 

871 n_features=n_features or 100), 

872 "~b-cl-f100": lambda n_features=100: _problem_for_predictor_binary_classification( 

873 n_features=n_features or 100), 

874 # 64 

875 "~b-cl-64": lambda n_features=None: _problem_for_predictor_binary_classification( 

876 dtype=numpy.float64, n_features=n_features), 

877 "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression( 

878 dtype=numpy.float64, n_features=n_features), 

# NOTE(review): the key says "b-cl-nop" but this forwards to
# _problem_for_clnoproba, the function registered under '~m-cl-nop';
# '~b-cl-nop' uses _problem_for_clnoproba_binary. Confirm which is intended.
879 '~b-cl-nop-64': lambda n_features=None: _problem_for_clnoproba( 

880 dtype=numpy.float64, n_features=n_features), 

881 '~b-clu-64': lambda n_features=None: _problem_for_clustering( 

882 dtype=numpy.float64, n_features=n_features), 

883 '~b-cl-dec-64': lambda n_features=None: _problem_for_cl_decision_function_binary( 

884 dtype=numpy.float64, n_features=n_features), 

885 '~num-tr-clu-64': lambda n_features=None: _problem_for_clustering_scores( 

886 dtype=numpy.float64, n_features=n_features), 

887 "~m-reg-64": lambda n_features=None: _problem_for_predictor_multi_regression( 

888 dtype=numpy.float64, n_features=n_features), 

889 "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform( 

890 dtype=numpy.float64, n_features=n_features), 

891 '~mix-64': lambda n_features=None: _problem_for_mixture( 

892 dtype=numpy.float64, n_features=n_features), 

# NF variants, dtype left to the wrapped function's default: the returned
# tuple is extended with a trailing ``False``.
893 # 

894 "~b-cl-NF": (lambda n_features=None: _problem_for_predictor_binary_classification( 

895 n_features=n_features) + (False, )), 

896 "~m-cl-NF": (lambda n_features=None: _problem_for_predictor_multi_classification( 

897 n_features=n_features) + (False, )), 

898 "~b-reg-NF": (lambda n_features=None: _problem_for_predictor_regression( 

899 n_features=n_features) + (False, )), 

900 "~m-reg-NF": (lambda n_features=None: _problem_for_predictor_multi_regression( 

901 n_features=n_features) + (False, )), 

# NF variants with dtype=numpy.float64.
902 # 

903 "~b-cl-NF-64": (lambda n_features=None: _problem_for_predictor_binary_classification( 

904 dtype=numpy.float64, n_features=n_features) + (False, )), 

905 "~m-cl-NF-64": (lambda n_features=None: _problem_for_predictor_multi_classification( 

906 dtype=numpy.float64, n_features=n_features) + (False, )), 

907 "~b-reg-NF-64": (lambda n_features=None: _problem_for_predictor_regression( 

908 dtype=numpy.float64, n_features=n_features) + (False, )), 

909 "~m-reg-NF-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

910 dtype=numpy.float64, n_features=n_features) + (False, )), 

# In every entry of this group the wrapped function receives ``True`` as its
# first positional argument; what that parameter means is set by the
# wrapped function's signature, which is not visible here.
911 # GaussianProcess 

912 "~b-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_regression( 

913 True, options={GaussianProcessRegressor: {"return_cov": True}}, 

914 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )), 

915 "~m-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

916 True, options={GaussianProcessRegressor: {"return_cov": True}}, 

917 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )), 

# Same as the two entries above, with return_std in place of return_cov.
918 # 

919 "~b-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_regression( 

920 True, options={GaussianProcessRegressor: {"return_std": True}}, 

921 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )), 

922 "~m-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

923 True, options={GaussianProcessRegressor: {"return_std": True}}, 

924 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )), 

# return_cov variants without the trailing ``False``.
925 # 

926 "~b-reg-cov-64": (lambda n_features=None: _problem_for_predictor_regression( 

927 True, options={GaussianProcessRegressor: {"return_cov": True}}, 

928 return_cov=True, dtype=numpy.float64, n_features=n_features)), 

929 "~m-reg-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

930 True, options={GaussianProcessRegressor: {"return_cov": True}}, 

931 return_cov=True, dtype=numpy.float64, n_features=n_features)), 

# return_std variants without the trailing ``False``.
# NOTE(review): "~reg-std-64" has no "b-" prefix although its sibling is
# "~b-reg-cov-64" and it wraps the same single-output regression function.
# Renaming would change the key seen by callers; confirm before touching.
932 # 

933 "~reg-std-64": (lambda n_features=None: _problem_for_predictor_regression( 

934 True, options={GaussianProcessRegressor: {"return_std": True}}, 

935 return_std=True, dtype=numpy.float64, n_features=n_features)), 

936 "~m-reg-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

937 True, options={GaussianProcessRegressor: {"return_std": True}}, 

938 return_std=True, dtype=numpy.float64, n_features=n_features)), 

# NSV variants: the float64 lambdas are passed through ``_noshapevar``
# (defined elsewhere in this file; its effect is not visible here).
939 # 

940 '~b-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_regression( 

941 dtype=numpy.float64, n_features=n_features)), 

942 '~m-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression( 

943 dtype=numpy.float64, n_features=n_features)), 

944 "~b-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_regression( 

945 True, options={GaussianProcessRegressor: {"return_std": True}}, 

946 return_std=True, dtype=numpy.float64, n_features=n_features))), 

947 "~m-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression( 

948 True, options={GaussianProcessRegressor: {"return_std": True}}, 

949 return_std=True, dtype=numpy.float64, n_features=n_features))), 

# Entries built by ``_1d_problem`` (defined elsewhere in this file),
# presumably single-feature versions of the wrapped problems - TODO confirm.
950 # isotonic 

951 "~b-reg-1d": _1d_problem(_problem_for_predictor_regression), 

952 '~num+y-tr-1d': _1d_problem(_problem_for_numerical_trainable_transform), 

953 # text 

954 "key-int-col": _problem_for_dict_vectorizer, 

955 "key-str-col": _problem_for_feature_hasher, 

956 "int-col": _problem_for_label_encoder, 

957 "one-hot": _problem_for_one_hot_encoder, 

958 'text-col': _problem_for_tfidf_vectorizer, 

959 'bow': _problem_for_tfidf_transformer, 

960}