Coverage for mlprodict/onnx_conv/convert.py: 91%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1# -*- encoding: utf-8 -*-

2"""

3@file

4@brief Overloads a conversion function.

5"""

6import pprint

7from collections import OrderedDict

8import numpy

9import pandas

10try:

11 from sklearn.metrics._scorer import _PredictScorer

12except ImportError: # pragma: no cover

13 # scikit-learn < 0.22

14 from sklearn.metrics.scorer import _PredictScorer

15from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version

16from skl2onnx.common.data_types import (

17 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type,

18 StringTensorType, Int64TensorType)

19from skl2onnx import convert_sklearn

20from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin

21from skl2onnx.algebra.type_helper import _guess_type

22from ..onnx_tools.onnx_manipulations import onnx_rename_names

23from .register_rewritten_converters import register_rewritten_operators

24from .register import register_converters

25from .scorers import CustomScorerTransform

def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`),
        if it has an attribute *_score_func*, the wrapped function
        and its *_kwargs* are extracted from it
    :param initial_types: types information
    :param name: name of the produced model, if None, it defaults to
        ``"mlprodict_fct_ONNX(<function name>)"``
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes forbidden
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        # scikit-learn scorer: unwrap the scoring function and its arguments
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = "mlprodict_fct_ONNX(%s)" % fct.__name__
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    # *name* must be forwarded, otherwise the documented parameter
    # (and the default computed above) is silently ignored.
    return convert_sklearn(
        tr, name=name, initial_types=initial_types,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)

def guess_initial_types(X, initial_types):
    """
    Guesses initial types from an array or a dataframe.

    @param      X               array or dataframe, only its first row
                                is looked at when it is a numpy array
                                or a dataframe
    @param      initial_types   hints about X, returned unchanged
                                when not None
    @return                     data types, a list of pairs
                                *(name, type)*

    A dataframe produces one entry per column, each of them with
    shape ``[None, 1]``, anything else produces a single entry
    named ``'X'``.
    """
    # Explicit hints always win.
    if initial_types is not None:
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    # One row is enough to guess the types.
    if isinstance(X, (numpy.ndarray, pandas.DataFrame)):
        X = X[:1]

    if not isinstance(X, pandas.DataFrame):
        return [('X', _guess_type(X))]

    guessed = []
    for col in X.columns:
        values = X[col].values
        if isinstance(values[0], (str, numpy.str_)):
            col_type = StringTensorType()
        else:
            col_type = _guess_type(values)
        # every dataframe column becomes a separate single-column input
        col_type.shape = [None, 1]
        guessed.append((col, col_type))
    return guessed

115

116

117def _replace_tensor_type(schema, tensor_type):

118 res = []

119 for name, ty in schema:

120 cl = ty.__class__

121 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type:

122 ty = tensor_type(ty.shape)

123 res.append((name, ty))

124 return res

125

126

def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Guesses initial types from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    When every column is two-dimensional, has an unknown first
    dimension and shares the same type class, the columns are merged
    into a single input ``'X'`` whose second dimension is the sum of
    the second dimensions of all columns. Otherwise the
    per-column schema is returned.
    """
    columns = guess_initial_types(X, schema)
    if tensor_type is not None:
        columns = _replace_tensor_type(columns, tensor_type)

    # Columns can only be merged if they all share shape pattern and class.
    seen = []
    for _, col_type in columns:
        shape = col_type.shape
        if len(shape) != 2 or shape[0] is not None:
            return columns  # pragma: no cover
        cls = col_type.__class__
        if seen and cls not in seen:
            return columns  # pragma: no cover
        if not seen:
            seen.append(cls)

    merged_class = seen[0]
    n_features = sum(col_type.shape[1] for _, col_type in columns)
    return [('X', merged_class([None, n_features]))]

153

154

def get_inputs_from_data(X, schema=None):
    """
    Produces input data for *onnx* runtime.

    @param      X           data, a numpy array or a dataframe
    @param      schema      schema if None, schema is guessed with
                            @see fn guess_schema_from_data
    @return                 input data, a dictionary
                            ``{ input name: numpy array }``

    An array requires a schema with a single input. A dataframe
    is either mapped column by column (one schema entry per column,
    each array reshaped into ``(-1, 1)``) or, when the schema holds
    a single two-dimensional input whose second dimension is unknown
    or equal to the number of columns, sent as one matrix.
    The function raises *RuntimeError* if the schema and the data do
    not match and *TypeError* if *X* is neither an array
    nor a dataframe.
    """
    def _cast_data(X, ct):
        # casts the data into the numpy type matching the tensor type
        if isinstance(ct, FloatTensorType):
            return X.astype(numpy.float32)
        if isinstance(ct, DoubleTensorType):
            return X.astype(numpy.float64)
        if isinstance(ct, StringTensorType):
            return X.astype(numpy.str_)
        if isinstance(ct, Int64TensorType):
            return X.astype(numpy.int64)
        raise RuntimeError(  # pragma: no cover
            "Unexpected column type {} for type {}."
            "".format(ct, type(X)))

    if schema is None:
        schema = guess_schema_from_data(X)
    if isinstance(X, numpy.ndarray):
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        return {schema[0][0]: _cast_data(X, schema[0][1])}
    if isinstance(X, pandas.DataFrame):
        n_columns = X.shape[1]
        if len(schema) == 1 and n_columns > 1:
            # The guessed schema merges columns sharing the same type
            # into a single input: the whole dataframe is that input.
            name, ct = schema[0]
            shape = getattr(ct, 'shape', None)
            if (shape is not None and len(shape) == 2 and
                    shape[1] in (None, n_columns)):
                return {name: _cast_data(X.values, ct)}
        if len(schema) != n_columns:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), X.shape[1]))
        return {sch[0]: _cast_data(X[c].values, sch[1]).reshape((-1, 1))
                for sch, c in zip(schema, X.columns)}
    raise TypeError(  # pragma: no cover
        "Unexpected type {}, expecting an array or a dataframe."
        "".format(type(X)))

194

195

def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model           model
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    If *schema* is given, it is returned (after the replacement
    of tensor types) once its length was compared to the length of
    the schema guessed from the model, if one can be guessed.
    Otherwise the schema is guessed from attribute *coef_*
    or from the key ``'feature_names'`` of the dictionary returned
    by method *dump_model*. The function raises *NotImplementedError*
    if none of them is available and *RuntimeError* if the given
    and the guessed schemas do not have the same length.
    """
    def _finalize(sch):
        # tensor_type=None means the types must be left untouched.
        if tensor_type is None:
            return list(sch)
        return _replace_tensor_type(sch, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            # nothing to compare with, the given schema is trusted
            return _finalize(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:\nGIVEN: {}\n-----\nGUESSED:\n{}".format(
                    schema, guessed))
        return _finalize(schema)

    if hasattr(model, 'coef_'):
        # linear model, the last dimension is used so that a
        # one-dimensional coef_ does not raise an IndexError
        init = [('X', FloatTensorType([None, model.coef_.shape[-1]]))]
        return _finalize(init)

    dumped = None
    if hasattr(model, 'dump_model'):
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            init = [(name, FloatTensorType([None, 1])) for name in names]
            return _finalize(init)

    # Nothing worked, the exception contains as much information
    # as possible about the model.
    data = pprint.pformat(model.__dict__)
    dirs = pprint.pformat(dir(model))
    if hasattr(model, 'dump_model'):  # pragma: no cover
        keys = sorted(dumped)
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))

242

243

def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*), it must be an
        *OrderedDict* if *model* is a scorer
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes forbidden
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may changes a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as an named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type when
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
                ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.7
        Parameter *rename_strategy* was added.
    """
    if isinstance(model, OnnxOperatorMixin):
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                "Missing attribute 'op_version' for type '{}'.".format(
                    type(model)))
        # NOTE(review): *verbose*, *target_opset* and *rename_strategy*
        # are not forwarded on this path -- presumably the mixin method
        # does not accept them, to be confirmed.
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types)

    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_converters()
    else:
        old_values, old_shapes = {}, {}

    def _guess_type_(X, itype, dtype):
        # Returns the initial types, the dtype and a numpy dtype
        # checked against the list of supported real types.
        initial_types = guess_initial_types(X, itype)
        if dtype is None:
            if hasattr(X, 'dtypes'):  # DataFrame
                dtype = numpy.float32
            elif hasattr(X, 'dtype'):
                dtype = X.dtype
            elif hasattr(X, 'type'):
                dtype = guess_numpy_type(X.type)
            elif initial_types is not None:
                dtype = guess_numpy_type(initial_types[0][1])
            else:
                raise RuntimeError(  # pragma: no cover
                    "dtype cannot be guessed: {}".format(
                        type(X)))
            # a guessed dtype is either double or float
            if dtype != numpy.float64:
                dtype = numpy.float32
        if dtype is None:
            raise RuntimeError("dtype cannot be None")  # pragma: no cover
        if isinstance(dtype, FloatTensorType):
            dtype = numpy.float32  # pragma: no cover
        elif isinstance(dtype, DoubleTensorType):
            dtype = numpy.float64  # pragma: no cover
        new_dtype = dtype
        if isinstance(dtype, numpy.ndarray):
            new_dtype = dtype.dtype  # pragma: no cover
        elif isinstance(dtype, DataType):
            new_dtype = numpy.float32  # pragma: no cover
        if new_dtype not in (numpy.float32, numpy.float64, numpy.int64,
                             numpy.int32, numpy.float16):
            raise NotImplementedError(  # pragma: no cover
                "dtype should be real not {} ({})".format(new_dtype, dtype))
        return initial_types, dtype, new_dtype

    if isinstance(model, _PredictScorer):
        if X is not None and not isinstance(X, OrderedDict):
            raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                             "".format(type(X)))
        if initial_types is None:
            if X is None:
                # explicit error instead of an AttributeError on None.items()
                raise ValueError(
                    "For a scorer, parameter X or initial_types "
                    "must be specified.")
            dts = []
            initial_types = []
            for k, v in X.items():
                if hasattr(v, 'dtype'):
                    dtype = guess_numpy_type(v.dtype)
                else:
                    dtype = v  # pragma: no cover
                it, _, ndt = _guess_type_(v, None, dtype)
                # every guessed input takes the name of its key in X
                initial_types.extend((k, item[1]) for item in it)
                dts.append(ndt)
            ndt = set(dts)
            if len(ndt) != 1:
                raise RuntimeError(  # pragma: no cover
                    "Multiple dtype is not efficient {}.".format(ndt))
        res = convert_scorer(model, initial_types, name=name,
                             target_opset=target_opset, options=options,
                             black_op=black_op, white_op=white_op,
                             final_types=final_types, verbose=verbose)
    else:
        if name is None:
            name = "mlprodict_ONNX(%s)" % model.__class__.__name__

        initial_types, _, __ = _guess_type_(X, initial_types, None)
        res = convert_sklearn(model, initial_types=initial_types, name=name,
                              target_opset=target_opset, options=options,
                              black_op=black_op, white_op=white_op,
                              final_types=final_types, verbose=verbose)

    # called with the values collected before the conversion
    register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is not None:
        res = onnx_rename_names(res, strategy=rename_strategy)
    return res

Coverage for mlprodict/onnx_conv/convert.py : 91%

150 statements

Coverage for mlprodict/onnx_conv/convert.py : 91%

150 statements 137 run 13 missing 31 excluded

150 statements