# Source code for mlprodict.sklapi.onnx_pipeline

"""
A pipeline which serializes into ONNX step by step.


:githublink:`%|py|5`
"""
import numpy
from sklearn.base import clone
from sklearn.pipeline import Pipeline, _fit_transform_one
from sklearn.utils.validation import check_memory
from sklearn.utils import _print_elapsed_time
from ..onnx_conv import to_onnx
from .onnx_transformer import OnnxTransformer


class OnnxPipeline(Pipeline):
    """
    The pipeline overwrites method *fit*, it trains and converts
    every step into ONNX before training the next step
    in order to minimize discrepancies. By default,
    ONNX is using float and not double which is the default
    for :epkg:`scikit-learn`. It may introduce discrepancies
    when a non-continuous model (mathematical definition) such
    as a tree ensemble is part of the pipeline.

    :param steps:
        List of (name, transform) tuples (implementing fit/transform) that are
        chained, in the order in which they are chained, with the last object
        an estimator.
    :param memory: str or object with the joblib.Memory interface, default=None
        Used to cache the fitted transformers of the pipeline. By default,
        no caching is performed. If a string is given, it is the path to
        the caching directory. Enabling caching triggers a clone of
        the transformers before fitting. Therefore, the transformer
        instance given to the pipeline cannot be inspected
        directly. Use the attribute ``named_steps`` or ``steps`` to
        inspect estimators within the pipeline. Caching the
        transformers is advantageous when fitting is time consuming.
    :param verbose: bool, default=False
        If True, the time elapsed while fitting each step will be printed as it
        is completed.
    :param output_name: string
        requested output name or None to request all and
        have method *transform* to store all of them in a dataframe
    :param enforce_float32: boolean
        :epkg:`onnxruntime` only supports *float32*,
        :epkg:`scikit-learn` usually uses double floats, this parameter
        ensures that every array of double floats is converted into
        single floats
    :param runtime: string, defined the runtime to use
        as described in :class:`OnnxInference <mlprodict.onnxrt.onnx_inference.OnnxInference>`.
    :param options: see :func:`to_onnx <mlprodict.onnx_conv.convert.to_onnx>`
    :param white_op: see :func:`to_onnx <mlprodict.onnx_conv.convert.to_onnx>`
    :param black_op: see :func:`to_onnx <mlprodict.onnx_conv.convert.to_onnx>`
    :param final_types: see :func:`to_onnx <mlprodict.onnx_conv.convert.to_onnx>`
    :param op_version: ONNX targeted opset

    The class stores transformers before converting them into ONNX
    in attributes ``raw_steps_``.

    See notebook :ref:`onnxdiscrepenciesrst` to see it can
    be used to reduce discrepancies after it was converted into
    *ONNX*.


    :githublink:`%|py|62`
    """

    def __init__(self, steps, *, memory=None, verbose=False,
                 output_name=None, enforce_float32=True,
                 runtime='python', options=None,
                 white_op=None, black_op=None, final_types=None,
                 op_version=None):
        self.output_name = output_name
        self.enforce_float32 = enforce_float32
        self.runtime = runtime
        self.options = options
        self.white_op = white_op
        self.black_op = black_op
        self.final_types = final_types
        self.op_version = op_version
        # The constructor calls _validate_step and it checks the value
        # of black_op, so the attributes above must be set first.
        Pipeline.__init__(
            self, steps, memory=memory, verbose=verbose)

    def fit(self, X, y=None, **fit_params):
        """
        Fits the model, fits all the transforms one after the
        other and transform the data, then fit the transformed
        data using the final estimator.

        :param X: iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.
        :param y: iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.
        :param fit_params: dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.
        :return: self, Pipeline, this estimator


        :githublink:`%|py|100`
        """
        fit_params_steps = self._check_fit_params(**fit_params)
        # Every step but the last one is fitted and replaced by its
        # ONNX counterpart in _fit; Xt is the output of the last
        # intermediate step.
        Xt = self._fit(X, y, **fit_params_steps)
        with _print_elapsed_time('OnnxPipeline',
                                 self._log_message(len(self.steps) - 1)):
            if self._final_estimator != 'passthrough':
                # The final estimator is trained on the transformed data
                # and is left as is (not converted here).
                fit_params_last_step = fit_params_steps[self.steps[-1][0]]
                self._final_estimator.fit(Xt, y, **fit_params_last_step)

        return self

    def _fit(self, X, y=None, **fit_params_steps):
        """
        Fits every step except the final one. Each fitted transformer
        is kept in ``raw_steps_`` and its ONNX conversion
        (see method *_to_onnx*) replaces it in ``steps``.

        :param X: training data given to the first step
        :param y: training targets, default=None
        :param fit_params_steps: fit parameters indexed by step name
        :return: the data produced by the last intermediate step
        """
        # shallow copy of steps - this should really be steps_
        if hasattr(self, 'raw_steps_') and self.raw_steps_ is not None:  # pylint: disable=E0203
            # Let's reuse the previous training.
            self.steps = list(self.raw_steps_)  # pylint: disable=E0203
            self.raw_steps_ = list(self.raw_steps_)
        else:
            self.steps = list(self.steps)
            self.raw_steps_ = list(self.steps)

        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)

        for (step_idx,
             name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False):
            if (transformer is None or transformer == 'passthrough'):
                with _print_elapsed_time('Pipeline',
                                         self._log_message(step_idx)):
                    continue

            if hasattr(memory, 'location') and memory.location is None:
                # joblib >= 0.12 with caching disabled:
                # we do not clone to preserve backward compatibility
                cloned_transformer = transformer
            else:
                cloned_transformer = clone(transformer)

            # Fit or load from cache the current transformer.
            # The input of the step is kept because it is needed
            # to convert the fitted transformer into ONNX.
            x_train = X
            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer, X, y, None,
                message_clsname='Pipeline',
                message=self._log_message(step_idx),
                **fit_params_steps[name])
            # Replace the transformer of the step with the fitted
            # transformer. This is necessary when loading the transformer
            # from the cache.
            self.raw_steps_[step_idx] = (name, fitted_transformer)
            self.steps[step_idx] = (
                name, self._to_onnx(name, fitted_transformer, x_train))
        return X

    def _to_onnx(self, name, fitted_transformer, x_train):
        """
        Converts a transformer into ONNX.

        :param  name:                model name
        :param  fitted_transformer:  fitted transformer
        :param  x_train:             training dataset
        :return:                     corresponding :class:`OnnxTransformer <mlprodict.sklapi.onnx_transformer.OnnxTransformer>`

        Raises *RuntimeError* if *x_train* is not a numpy array.


        :githublink:`%|py|170`
        """
        if not isinstance(x_train, numpy.ndarray):
            raise RuntimeError(  # pragma: no cover
                "The pipeline only handle numpy arrays not {}.".format(
                    type(x_train)))
        kwargs = {
            # only the options targeting this step are kept
            'options': self._preprocess_options(name, self.options),
            'white_op': self.white_op,
            'black_op': self.black_op,
            'final_types': self.final_types,
            'target_opset': self.op_version,
        }
        # The data is kept as is only if it is float64
        # and enforce_float32 is False, it is cast to float32 otherwise.
        if self.enforce_float32 or x_train.dtype != numpy.float64:
            x_train = x_train.astype(numpy.float32)
        onx = to_onnx(fitted_transformer, x_train, **kwargs)
        tr = OnnxTransformer(
            onx.SerializeToString(), output_name=self.output_name,
            enforce_float32=self.enforce_float32, runtime=self.runtime)
        return tr.fit()

    def _preprocess_options(self, name, options):
        """
        Preprocesses the options, it selects the options
        which apply to one step.

        :param      name:        step name, string keys are expected
                                 to be prefixed by ``<name>__``
        :param      options:     conversion options
        :return:                 new options, None if *options* is None

        String keys starting with ``<name>__`` are kept without
        the prefix, the other string keys are dropped.
        Keys which are not strings are kept unchanged.


        :githublink:`%|py|196`
        """
        if options is None:
            return None
        prefix = name + '__'
        new_options = {}
        for k, v in options.items():
            if not isinstance(k, str):
                new_options[k] = v
            elif k.startswith(prefix):
                new_options[k[len(prefix):]] = v
        return new_options
# Source code for mlprodict.sklapi.onnx_pipeline
#
# mlprodict
#
# Navigation
#
# Related Topics