Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- encoding: utf-8 -*-
2"""
3@file
4@brief Overloads a conversion function.
5"""
6import pprint
7from collections import OrderedDict
8import numpy
9import pandas
10try:
11 from sklearn.metrics._scorer import _PredictScorer
12except ImportError: # pragma: no cover
13 # scikit-learn < 0.22
14 from sklearn.metrics.scorer import _PredictScorer
15from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version
16from skl2onnx.common.data_types import (
17 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type,
18 StringTensorType, Int64TensorType)
19from skl2onnx import convert_sklearn
20from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin
21from skl2onnx.algebra.type_helper import _guess_type
22from ..onnx_tools.onnx_manipulations import onnx_rename_names
23from .register_rewritten_converters import register_rewritten_operators
24from .register import register_converters
25from .scorers import CustomScorerTransform
def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`),
        if it has an attribute *_score_func*, that function is converted
        and attribute *_kwargs* is given to the custom transformer
    :param initial_types: types information
    :param name: name of the produced model, if None, it is built
        from the name of the converted function
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        # a scorer object: unwrap the scoring function and its parameters
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = "mlprodict_fct_ONNX(%s)" % fct.__name__
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    # *name* must be forwarded, otherwise the parameter (and the default
    # built above) has no effect on the produced model.
    return convert_sklearn(
        tr, name=name, initial_types=initial_types,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)
def guess_initial_types(X, initial_types):
    """
    Guesses initial types from an array or a dataframe.

    @param      X               array or dataframe, only the first row
                                is used to guess the types
    @param      initial_types   hints about X, returned as is
                                when not None
    @return                     data types, a list of pairs
                                *(name, type)* when they are guessed
    """
    # Known types always win over any guess.
    if initial_types is not None:
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    if isinstance(X, (numpy.ndarray, pandas.DataFrame)):
        X = X[:1]

    if not isinstance(X, pandas.DataFrame):
        # a single unnamed input
        return [('X', _guess_type(X))]

    # One named input per column of the dataframe.
    guessed = []
    for column in X.columns:
        values = X[column].values
        if isinstance(values[0], (str, numpy.str_)):
            column_type = StringTensorType()
        else:
            column_type = _guess_type(values)
        column_type.shape = [None, 1]
        guessed.append((column, column_type))
    return guessed
117def _replace_tensor_type(schema, tensor_type):
118 res = []
119 for name, ty in schema:
120 cl = ty.__class__
121 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type:
122 ty = tensor_type(ty.shape)
123 res.append((name, ty))
124 return res
def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Guesses initial types from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns),
                                the columns are merged into a single
                                input ``'X'`` when they all have two
                                dimensions, an unknown first dimension
                                and the same type
    """
    init = guess_initial_types(X, schema)
    if tensor_type is not None:
        init = _replace_tensor_type(init, tensor_type)

    # Grouping columns: give up and return the schema unchanged as soon as
    # one column does not allow the merge.
    kinds = []
    for _, col in init:
        if len(col.shape) != 2 or col.shape[0] is not None:
            return init  # pragma: no cover
        kind = col.__class__
        if kinds and kind not in kinds:
            return init  # pragma: no cover
        if not kinds:
            kinds.append(kind)

    # The merged input is as wide as all the columns together.
    width = sum(col.shape[1] for _, col in init)
    return [('X', kinds[0]([None, width]))]
def get_inputs_from_data(X, schema=None):
    """
    Produces input data for *onnx* runtime.

    @param      X           data (array or dataframe)
    @param      schema      schema if None, schema is guessed with
                            @see fn guess_schema_from_data
    @return                 input data, a dictionary
                            ``{ input name: array }``
    """
    def _cast_data(values, column_type):
        # Ordered pairs (tensor type, numpy type), the first match wins.
        casts = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, numpy.str_),
            (Int64TensorType, numpy.int64),
        )
        for tensor_class, numpy_type in casts:
            if isinstance(column_type, tensor_class):
                return values.astype(numpy_type)
        raise RuntimeError(  # pragma: no cover
            "Unexpected column type {} for type {}."
            "".format(column_type, type(values)))

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        # an array feeds exactly one input
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        single = schema[0]
        return {single[0]: _cast_data(X, single[1])}

    if isinstance(X, pandas.DataFrame):
        n_columns = X.shape[1]
        if len(schema) != n_columns:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), n_columns))
        # every column becomes a matrix with a single column
        feeds = {}
        for entry, column in zip(schema, X.columns):
            casted = _cast_data(X[column].values, entry[1])
            feeds[entry[0]] = casted.reshape((-1, 1))
        return feeds

    raise TypeError(  # pragma: no cover
        "Unexpected type {}, expecting an array or a dataframe."
        "".format(type(X)))
def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model       model, the schema is guessed from attribute
                            *coef_* or from the dictionary returned by
                            method *dump_model* (key ``'feature_names'``)
    @param      tensor_type if not None, replaces every
                            *FloatTensorType* or *DoubleTensorType*
                            by this one
    @param      schema      known schema, if not None, it is returned
                            (after the type replacement) once its length
                            was compared to the guessed one
    @return                 schema (list of typed and named columns)

    The function raises *NotImplementedError* if the schema cannot be
    guessed and *RuntimeError* if *schema* and the guessed schema do not
    have the same number of columns.
    """
    def _retype(columns):
        # *tensor_type* is optional: the replacement only happens when one
        # is given, otherwise a copy of the schema is returned.
        if tensor_type is None:
            return [(name, ty) for name, ty in columns]
        return _replace_tensor_type(columns, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            # nothing to compare with, the given schema is trusted
            return _retype(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:\n"
                "GIVEN: {}\n-----\nGUESSED:\n{}".format(schema, guessed))
        return _retype(schema)

    if hasattr(model, 'coef_'):
        # linear model: the number of features is the last dimension
        # of coef_, which also works when coef_ has only one dimension
        init = [('X', FloatTensorType([None, model.coef_.shape[-1]]))]
        return _retype(init)

    has_dump = hasattr(model, 'dump_model')
    dumped = None
    if has_dump:
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            init = [(name, FloatTensorType([None, 1])) for name in names]
            return _retype(init)

    # Nothing worked: the error message gathers everything known
    # about the model.
    data = pprint.pformat(model.__dict__)
    dirs = pprint.pformat(dir(model))
    if has_dump:  # pragma: no cover
        keys = sorted(dumped)
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))
def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*), it must be
        an *OrderedDict* if *model* is a scorer
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes allowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may change a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as a named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type when
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
                ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.7
        Parameter *rename_strategy* was added.
    """
    # Models able to convert themselves do the whole job,
    # *verbose* is not forwarded to them.
    if isinstance(model, OnnxOperatorMixin):
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                "Missing attribute 'op_version' for type '{}'.".format(
                    type(model)))
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types)

    # Previous converters are kept to be registered again
    # after the conversion.
    old_values, old_shapes = {}, {}
    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_converters()

    def _infer_types(data, itype, dtype):
        # Returns the initial types, the dtype and the numpy dtype
        # deduced from it.
        guessed = guess_initial_types(data, itype)
        if dtype is None:
            if hasattr(data, 'dtypes'):  # DataFrame
                dtype = numpy.float32
            elif hasattr(data, 'dtype'):
                dtype = data.dtype
            elif hasattr(data, 'type'):
                dtype = guess_numpy_type(data.type)
            elif guessed is not None:
                dtype = guess_numpy_type(guessed[0][1])
            else:
                raise RuntimeError(  # pragma: no cover
                    "dtype cannot be guessed: {}".format(
                        type(data)))
            # a guessed dtype is either double or float
            if dtype != numpy.float64:
                dtype = numpy.float32
        if dtype is None:
            raise RuntimeError("dtype cannot be None")  # pragma: no cover

        if isinstance(dtype, FloatTensorType):
            dtype = numpy.float32  # pragma: no cover
        elif isinstance(dtype, DoubleTensorType):
            dtype = numpy.float64  # pragma: no cover

        if isinstance(dtype, numpy.ndarray):
            numpy_dtype = dtype.dtype  # pragma: no cover
        elif isinstance(dtype, DataType):
            numpy_dtype = numpy.float32  # pragma: no cover
        else:
            numpy_dtype = dtype

        allowed = (numpy.float32, numpy.float64, numpy.int64,
                   numpy.int32, numpy.float16)
        if numpy_dtype not in allowed:
            raise NotImplementedError(  # pragma: no cover
                "dtype should be real not {} ({})".format(numpy_dtype, dtype))
        return guessed, dtype, numpy_dtype

    if isinstance(model, _PredictScorer):
        if X is not None and not isinstance(X, OrderedDict):
            raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                             "".format(type(X)))
        if initial_types is None:
            # one group of inputs per key of X, named after the key
            found_dtypes = []
            initial_types = []
            for key, value in X.items():
                if hasattr(value, 'dtype'):
                    hint = guess_numpy_type(value.dtype)
                else:
                    hint = value  # pragma: no cover
                guessed, _, numpy_dtype = _infer_types(value, None, hint)
                initial_types.extend((key, item[1]) for item in guessed)
                found_dtypes.append(numpy_dtype)
            distinct = set(found_dtypes)
            if len(distinct) != 1:
                raise RuntimeError(  # pragma: no cover
                    "Multiple dtype is not efficient {}.".format(distinct))
        res = convert_scorer(model, initial_types, name=name,
                             target_opset=target_opset, options=options,
                             black_op=black_op, white_op=white_op,
                             final_types=final_types, verbose=verbose)
    else:
        if name is None:
            name = "mlprodict_ONNX(%s)" % model.__class__.__name__

        initial_types = _infer_types(X, initial_types, None)[0]
        res = convert_sklearn(model, initial_types=initial_types, name=name,
                              target_opset=target_opset, options=options,
                              black_op=black_op, white_op=white_op,
                              final_types=final_types, verbose=verbose)

    register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is None:
        return res
    return onnx_rename_names(res, strategy=rename_strategy)