Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- encoding: utf-8 -*- 

2""" 

3@file 

4@brief Overloads a conversion function. 

5""" 

6import pprint 

7from collections import OrderedDict 

8import numpy 

9import pandas 

10try: 

11 from sklearn.metrics._scorer import _PredictScorer 

12except ImportError: # pragma: no cover 

13 # scikit-learn < 0.22 

14 from sklearn.metrics.scorer import _PredictScorer 

15from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version 

16from skl2onnx.common.data_types import ( 

17 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type, 

18 StringTensorType, Int64TensorType) 

19from skl2onnx import convert_sklearn 

20from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin 

21from skl2onnx.algebra.type_helper import _guess_type 

22from ..onnx_tools.onnx_manipulations import onnx_rename_names 

23from .register_rewritten_converters import register_rewritten_operators 

24from .register import register_converters 

25from .scorers import CustomScorerTransform 

26 

27 

def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`),
        if it has an attribute *_score_func*, the wrapped function
        and its *_kwargs* are extracted from it
    :param initial_types: types information
    :param name: name of the produced model, if None, it is built
        from the name of the function
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        # a scorer object: unwrap the scoring function and its parameters
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = "mlprodict_fct_ONNX(%s)" % fct.__name__
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    # *name* must be forwarded, otherwise the requested (or default)
    # model name is silently dropped
    return convert_sklearn(
        tr, initial_types=initial_types, name=name,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)

86 

87 

def guess_initial_types(X, initial_types):
    """
    Guesses initial types from an array or a dataframe.

    @param      X               array or dataframe
    @param      initial_types   hints about X, returned unchanged
                                when they are not None
    @return                     data types, a list of pairs
                                *(name, type)*

    Only the first row of an array or a dataframe is used to guess
    the types. A dataframe produces one entry per column, every
    entry has shape ``[None, 1]``, anything else produces a single
    entry named ``'X'``.
    """
    # explicit hints always win
    if initial_types is not None:
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    if isinstance(X, pandas.DataFrame):
        first_row = X[:1]
        guessed_types = []
        for column in first_row.columns:
            # string columns are detected from their first value
            if isinstance(first_row[column].values[0], (str, numpy.str_)):
                column_type = StringTensorType()
            else:
                column_type = _guess_type(first_row[column].values)
            column_type.shape = [None, 1]
            guessed_types.append((column, column_type))
        return guessed_types

    if isinstance(X, numpy.ndarray):
        X = X[:1]
    return [('X', _guess_type(X))]

115 

116 

117def _replace_tensor_type(schema, tensor_type): 

118 res = [] 

119 for name, ty in schema: 

120 cl = ty.__class__ 

121 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type: 

122 ty = tensor_type(ty.shape) 

123 res.append((name, ty)) 

124 return res 

125 

126 

def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Guesses initial types from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    When every column is two-dimensional, has an unknown first
    dimension, a known second dimension and the same type class,
    the columns are grouped into a single input named ``'X'``
    whose width is the sum of the column widths. Otherwise the
    schema is returned ungrouped.
    """
    init = guess_initial_types(X, schema)
    if tensor_type is not None:
        init = _replace_tensor_type(init, tensor_type)
    if not init:
        # no column at all, nothing to group
        return init

    # Grouping column
    classes = set()
    for _, col in init:
        if len(col.shape) != 2:
            return init  # pragma: no cover
        if col.shape[0] is not None:
            return init  # pragma: no cover
        if col.shape[1] is None:
            # unknown width, the total width cannot be computed
            return init  # pragma: no cover
        if classes and col.__class__ not in classes:
            return init  # pragma: no cover
        classes.add(col.__class__)
    # the loop guarantees *classes* holds exactly one class here
    grouped_class = next(iter(classes))
    width = sum(col.shape[1] for _, col in init)
    return [('X', grouped_class([None, width]))]

153 

154 

def get_inputs_from_data(X, schema=None):
    """
    Produces input data for *onnx* runtime.

    @param      X           data, a numpy array or a dataframe
    @param      schema      schema if None, schema is guessed with
                            @see fn guess_schema_from_data
    @return                 input data, a dictionary
                            ``{ input name: numpy array }``

    An array is cast as a whole into the type of the only schema entry.
    A dataframe is split into one input per column, each of them
    cast and reshaped into a single column matrix.
    """
    def _cast_data(data, ct):
        # ordered pairs (tensor type, numpy type), the first match wins
        known_casts = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, numpy.str_),
            (Int64TensorType, numpy.int64),
        )
        for tensor_class, numpy_type in known_casts:
            if isinstance(ct, tensor_class):
                return data.astype(numpy_type)
        raise RuntimeError(  # pragma: no cover
            "Unexpected column type {} for type {}."
            "".format(ct, type(data)))

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        if len(schema) == 1:
            only = schema[0]
            return {only[0]: _cast_data(X, only[1])}
        raise RuntimeError(  # pragma: no cover
            "More than one column but input is an array.")

    if isinstance(X, pandas.DataFrame):
        if len(schema) != X.shape[1]:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), X.shape[1]))
        feeds = {}
        # schema entries and dataframe columns are paired by position
        for entry, column in zip(schema, X.columns):
            casted = _cast_data(X[column].values, entry[1])
            feeds[entry[0]] = casted.reshape((-1, 1))
        return feeds

    raise TypeError(  # pragma: no cover
        "Unexpected type {}, expecting an array or a dataframe."
        "".format(type(X)))

194 

195 

def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model           model
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    The schema is guessed from attribute *coef_* (one input ``'X'``
    whose width is the last dimension of *coef_*) or from key
    ``'feature_names'`` of the dictionary returned by method
    *dump_model* (one input of width 1 per feature).
    If *schema* is specified, it is only checked against the guessed
    one (same number of entries) whenever a schema can be guessed.
    The function raises *NotImplementedError* if no schema
    can be guessed and *schema* is None.
    """
    def _retype(sch):
        # *tensor_type* is optional, nothing is replaced when it is None
        if tensor_type is None:
            return list(sch)
        return _replace_tensor_type(sch, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            # nothing to compare with, the given schema is trusted
            return _retype(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:\n"
                "GIVEN: {}\n-----\nGUESSED:\n{}".format(schema, guessed))
        return _retype(schema)

    if hasattr(model, 'coef_'):
        # linear model, the last dimension is used so that
        # a one-dimensional *coef_* is supported as well
        init = [('X', FloatTensorType([None, model.coef_.shape[-1]]))]
        return _retype(init)

    has_dump = hasattr(model, 'dump_model')
    dumped = None
    if has_dump:
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            init = [(name, FloatTensorType([None, 1])) for name in names]
            return _retype(init)

    # No schema could be guessed: the error message gives as much
    # information as possible about the model.
    data = pprint.pformat(model.__dict__)
    dirs = pprint.pformat(dir(model))
    if has_dump:  # pragma: no cover
        keys = sorted(dumped)
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            # keeps the message to a reasonable size
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))

242 

243 

def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*), for a scorer,
        it must be an *OrderedDict*
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters
        before converting the model
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may change a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as an named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type when
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
                ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.7
        Parameter *rename_strategy* was added.
    """
    if isinstance(model, OnnxOperatorMixin):
        # the model knows how to convert itself
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                "Missing attribute 'op_version' for type '{}'.".format(
                    type(model)))
        # *verbose* is not forwarded to this call
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types)

    def _guess_type_(X, itype, dtype):
        # Returns the initial types, the dtype and the numpy dtype
        # used to check the conversion only deals with real numbers.
        initial_types = guess_initial_types(X, itype)
        if dtype is None:
            if hasattr(X, 'dtypes'):  # DataFrame
                dtype = numpy.float32
            elif hasattr(X, 'dtype'):
                dtype = X.dtype
            elif hasattr(X, 'type'):
                dtype = guess_numpy_type(X.type)
            elif initial_types is not None:
                dtype = guess_numpy_type(initial_types[0][1])
            else:
                raise RuntimeError(  # pragma: no cover
                    "dtype cannot be guessed: {}".format(
                        type(X)))
            # a guessed dtype is either double or float
            if dtype != numpy.float64:
                dtype = numpy.float32
        if dtype is None:
            raise RuntimeError("dtype cannot be None")  # pragma: no cover
        if isinstance(dtype, FloatTensorType):
            dtype = numpy.float32  # pragma: no cover
        elif isinstance(dtype, DoubleTensorType):
            dtype = numpy.float64  # pragma: no cover
        new_dtype = dtype
        if isinstance(dtype, numpy.ndarray):
            new_dtype = dtype.dtype  # pragma: no cover
        elif isinstance(dtype, DataType):
            new_dtype = numpy.float32  # pragma: no cover
        if new_dtype not in (numpy.float32, numpy.float64, numpy.int64,
                             numpy.int32, numpy.float16):
            raise NotImplementedError(  # pragma: no cover
                "dtype should be real not {} ({})".format(new_dtype, dtype))
        return initial_types, dtype, new_dtype

    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_converters()
    else:
        old_values, old_shapes = {}, {}

    try:
        if isinstance(model, _PredictScorer):
            if X is not None and not isinstance(X, OrderedDict):
                raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                                 "".format(type(X)))
            if initial_types is None:
                if X is None:
                    # fails early with an explicit message instead of
                    # an AttributeError on ``None.items()``
                    raise ValueError(
                        "For a scorer, X or initial_types must be specified.")
                dts = []
                initial_types = []
                for k, v in X.items():
                    if hasattr(v, 'dtype'):
                        dtype = guess_numpy_type(v.dtype)
                    else:
                        dtype = v  # pragma: no cover
                    it, _, ndt = _guess_type_(v, None, dtype)
                    # every guessed input takes the name of its key in X
                    initial_types.extend((k, entry[1]) for entry in it)
                    dts.append(ndt)
                ndt = set(dts)
                if len(ndt) != 1:
                    raise RuntimeError(  # pragma: no cover
                        "Multiple dtype is not efficient {}.".format(ndt))
            res = convert_scorer(model, initial_types, name=name,
                                 target_opset=target_opset, options=options,
                                 black_op=black_op, white_op=white_op,
                                 final_types=final_types, verbose=verbose)
        else:
            if name is None:
                name = "mlprodict_ONNX(%s)" % model.__class__.__name__

            # the call also checks the input dtype is supported
            initial_types, _, _ = _guess_type_(X, initial_types, None)
            res = convert_sklearn(model, initial_types=initial_types, name=name,
                                  target_opset=target_opset, options=options,
                                  black_op=black_op, white_op=white_op,
                                  final_types=final_types, verbose=verbose)
    finally:
        # Hands the values saved before the conversion back to
        # register_rewritten_operators even if the conversion failed,
        # presumably to restore the previous converters.
        register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is not None:
        res = onnx_rename_names(res, strategy=rename_strategy)
    return res