# Source code for mlprodict.cli.convert_validate

"""
Command line about validation of prediction runtime.


:githublink:`%|py|5`
"""
import os
import pickle
from logging import getLogger
import warnings
from pandas import read_csv
from skl2onnx.common.data_types import FloatTensorType, DoubleTensorType
from ..onnx_conv import to_onnx
from ..onnxrt import OnnxInference
from ..onnxrt.optim import onnx_optimisations
from ..onnxrt.validate.validate_difference import measure_relative_difference
from ..onnx_conv import guess_schema_from_data, guess_schema_from_model


def convert_validate(pkl, data=None, schema=None,
                     method="predict", name='Y',
                     target_opset=None, outonnx="model.onnx",
                     runtime='python', metric="l1med",
                     use_double=None, noshape=False,
                     optim='onnx', rewrite_ops=True,
                     options=None, fLOG=print, verbose=1,
                     register=True):
    """
    Converts a model stored in *pkl* file and measures the differences
    between the model and the ONNX predictions.

    :param pkl: pickle file
    :param data: data file, loaded with pandas,
        converted to a single array,
        the data is used to guess the schema if *schema* not specified
    :param schema: initial type of the model
    :param method: method to call
    :param name: output name
    :param target_opset: target opset
    :param outonnx: produced ONNX model
    :param runtime: runtime to use to compute predictions,
        'python', 'python_compiled',
        'onnxruntime1' or 'onnxruntime2'
    :param metric: the metric 'l1med' is given by function
        :func:`measure_relative_difference
        <mlprodict.onnxrt.validate.validate_difference.measure_relative_difference>`
    :param noshape: run the conversion with no shape information
    :param use_double: use double for the runtime if possible,
        two possible options, ``"float64"`` or ``'switch'``,
        the first option produces an ONNX file with doubles,
        the second option loads an ONNX file (float or double)
        and replaces matrices in ONNX with the matrices coming from
        the model, this second way is just for testing purposes
    :param optim: applies optimisations on the first ONNX graph,
        use 'onnx' to reduce the number of node Identity and
        redundant subgraphs
    :param rewrite_ops: rewrites some converters from skl2onnx
    :param options: additional options for conversion,
        dictionary as a string
    :param verbose: verbose level
    :param register: registers additional converters implemented by this
        package
    :param fLOG: logging function
    :return: a dictionary with the results

    .. cmdref::
        :title: Converts and compares an ONNX file
        :cmd: -m mlprodict convert_validate --help
        :lid: l-cmd-convert_validate

        The command converts and validates a :epkg:`scikit-learn` model.
        An example to check the prediction of a logistic regression.

        ::

            import os
            import pickle
            import pandas
            from sklearn.datasets import load_iris
            from sklearn.model_selection import train_test_split
            from sklearn.linear_model import LogisticRegression
            from mlprodict.__main__ import main
            from mlprodict.cli import convert_validate

            iris = load_iris()
            X, y = iris.data, iris.target
            X_train, X_test, y_train, _ = train_test_split(
                X, y, random_state=11)
            clr = LogisticRegression()
            clr.fit(X_train, y_train)

            pandas.DataFrame(X_test).to_csv("data.csv", index=False)
            with open("model.pkl", "wb") as f:
                pickle.dump(clr, f)

        And the command line to check the predictions
        using a command line.

        ::

            convert_validate --pkl model.pkl --data data.csv
                             --method predict,predict_proba
                             --name output_label,output_probability
                             --verbose 1
    """
    if fLOG is None:
        verbose = 0  # pragma: no cover
    if use_double not in (None, 'float64', 'switch'):
        raise ValueError(  # pragma: no cover
            "use_double must be either None, 'float64' or 'switch'")
    # Empty strings come from the command line: normalise to None.
    if optim == '':
        optim = None  # pragma: no cover
    if target_opset == '':
        target_opset = None  # pragma: no cover
    if verbose == 0:
        # Silence skl2onnx logging when not verbose.
        logger = getLogger('skl2onnx')
        logger.disabled = True
    if not os.path.exists(pkl):
        raise FileNotFoundError(  # pragma: no cover
            "Unable to find model '{}'.".format(pkl))
    if os.path.exists(outonnx):
        warnings.warn("File '{}' will be overwritten.".format(outonnx))
    if verbose > 0:
        fLOG("[convert_validate] load model '{}'".format(pkl))
    with open(pkl, "rb") as f:
        model = pickle.load(f)

    if use_double == 'float64':
        tensor_type = DoubleTensorType
    else:
        tensor_type = FloatTensorType
    if options in (None, ''):
        options = None
    else:
        from ..onnxrt.validate.validate_scenarios import (
            interpret_options_from_string)
        options = interpret_options_from_string(options)
    if verbose > 0:
        fLOG("[convert_validate] options={}".format(repr(options)))

    if register:
        from ..onnx_conv import (
            register_converters, register_rewritten_operators)
        register_converters()
        register_rewritten_operators()

    # data and schema
    if data is None or not os.path.exists(data):
        # No data available: the schema must come from the model itself.
        if schema is None:
            schema = guess_schema_from_model(model, tensor_type)
        if verbose > 0:
            fLOG("[convert_validate] model schema={}".format(schema))
        df = None
    else:
        if verbose > 0:
            fLOG("[convert_validate] load data '{}'".format(data))
        df = read_csv(data)
        if verbose > 0:
            fLOG("[convert_validate] convert data into matrix")
        if schema is None:
            schema = guess_schema_from_data(df, tensor_type)
        if schema is None:
            # Fallback: a single 2D input with as many columns as the data.
            schema = [  # pragma: no cover
                ('X', tensor_type([None, df.shape[1]]))]
        if len(schema) == 1:
            # Single input: feed a plain numpy matrix instead of a DataFrame.
            df = df.values
        if verbose > 0:
            fLOG("[convert_validate] data schema={}".format(schema))

    if noshape:
        if verbose > 0:
            fLOG(  # pragma: no cover
                "[convert_validate] convert the model with no shape information")
        # Drop every dimension so the converter cannot rely on shapes.
        schema = [(name_, col.__class__([None, None]))
                  for name_, col in schema]
        onx = to_onnx(
            model, initial_types=schema, rewrite_ops=rewrite_ops,
            target_opset=target_opset, options=options)
    else:
        if verbose > 0:
            fLOG("[convert_validate] convert the model with shapes")
        onx = to_onnx(
            model, initial_types=schema, target_opset=target_opset,
            rewrite_ops=rewrite_ops, options=options)

    if optim is not None:
        if verbose > 0:
            fLOG("[convert_validate] run optimisations '{}'".format(optim))
        onx = onnx_optimisations(onx, optim=optim)
    if verbose > 0:
        fLOG("[convert_validate] saves to '{}'".format(outonnx))
    memory = onx.SerializeToString()
    with open(outonnx, 'wb') as f:
        f.write(memory)

    if verbose > 0:
        fLOG("[convert_validate] creates OnnxInference session")
    sess = OnnxInference(onx, runtime=runtime)
    if use_double == "switch":
        if verbose > 0:
            fLOG("[convert_validate] switch to double")
        sess.switch_initializers_dtype(model)

    if verbose > 0:
        fLOG("[convert_validate] compute prediction from model")
    # Comma-separated strings let the command line request several
    # methods/outputs at once (e.g. predict,predict_proba).
    if ',' in method:
        methods = method.split(',')
    else:
        methods = [method]
    if ',' in name:
        names = name.split(',')
    else:
        names = [name]

    if len(names) != len(methods):
        raise ValueError(
            "Number of methods and outputs do not match: {}, {}".format(
                names, methods))

    if metric != 'l1med':
        raise ValueError(  # pragma: no cover
            "Unknown metric '{}'".format(metric))

    if df is None:
        # no test on data
        return dict(onnx=memory)

    if verbose > 0:
        fLOG("[convert_validate] compute predictions from ONNX with name '{}'"
             "".format(name))

    ort_preds = sess.run(
        {'X': df}, verbose=max(verbose - 1, 0), fLOG=fLOG)

    metrics = []
    out_skl_preds = []
    out_ort_preds = []
    for method_, name_ in zip(methods, names):
        if verbose > 0:
            fLOG("[convert_validate] compute predictions with method '{}'".format(
                method_))
        meth = getattr(model, method_)
        skl_pred = meth(df)
        # Fixed: the original appended the input *df* instead of the
        # scikit-learn prediction, so the returned 'skl_pred' entries
        # did not match the metrics computed below.
        out_skl_preds.append(skl_pred)

        if name_ not in ort_preds:
            raise KeyError(
                "Unable to find output name '{}' in {}".format(
                    name_, list(sorted(ort_preds))))

        ort_pred = ort_preds[name_]
        out_ort_preds.append(ort_pred)

        diff = measure_relative_difference(skl_pred, ort_pred)
        if verbose > 0:
            fLOG("[convert_validate] {}={}".format(metric, diff))
        metrics.append(diff)
    return dict(skl_pred=out_skl_preds, ort_pred=out_ort_preds,
                metrics=metrics, onnx=memory)