Benchmark (ONNX) for ReduceSum#

The experiment compares the execution time of operator ReduceSum between numpy and onnxruntime (the benchmark script also measures mlprodict's python runtime and, when installed, tensorflow and pytorch).
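As a minimal sketch of what is being measured (assuming the same libraries as the benchmark script below), the following snippet builds an ONNX graph holding a single ReduceSum node and checks it against numpy.sum; the benchmark then times exactly these kinds of calls.

import numpy
from onnxruntime import InferenceSession
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxReduceSumApi11

# a graph reduced to a single ReduceSum node summing over axis 1
node = OnnxReduceSumApi11('X', axes=[1], op_version=13, output_names=['Y'])
onx = node.to_onnx([('X', FloatTensorType([None, 10]))],
                   outputs=[('Y', FloatTensorType())], target_opset=13)

sess = InferenceSession(onx.SerializeToString())
X = numpy.random.rand(5, 10).astype(numpy.float32)
got = sess.run(None, {'X': X})[0]
# ONNX ReduceSum keeps the reduced dimension by default (keepdims=1)
assert numpy.allclose(got, numpy.sum(X, axis=1, keepdims=True))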

Overview#

[Overview plot omitted: benchmark curves for ReduceSum produced by the script in section Benchmark code below.]

Configuration#

<<<

import os
from pyquickhelper.pandashelper import df2rst
import pandas
name = os.path.join(
    __WD__, "../../onnx/results/bench_plot_onnxruntime_reduce_sum.time.csv")
df = pandas.read_csv(name)
print(df2rst(df, number_format=4))

>>>

==============  ============================================================  =====================
name            version                                                       value
==============  ============================================================  =====================
date            2020-10-02
python          3.7.2 (default, Mar 1 2019, 18:34:21) [GCC 6.3.0 20170516]
platform        linux
OS              Linux-4.9.0-8-amd64-x86_64-with-debian-9.6
machine         x86_64
processor
release         4.9.0-8-amd64
architecture    ('64bit', '')
mlprodict       0.4.1308
numpy           1.18.5                                                        openblas, language=c
onnx            1.7.1078                                                      opset=12
onnxruntime     1.5.991                                                       CPU-MKL-ML
pandas          1.1.2
skl2onnx        1.7.1082
sklearn         0.23.2
==============  ============================================================  =====================

Raw results#

bench_plot_onnxruntime_reduce_sum.csv

<<<

import os
from pyquickhelper.pandashelper import df2rst
from pymlbenchmark.benchmark.bench_helper import bench_pivot
import pandas
name = os.path.join(
    __WD__, "../../onnx/results/bench_plot_onnxruntime_reduce_sum.perf.csv")
df = pandas.read_csv(name)
piv = bench_pivot(df).reset_index(drop=False)
piv['speedup'] = piv['npy'] / piv['ort']
print(df2rst(piv, number_format=4))
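The speedup column divides the average numpy time by the average onnxruntime time, so a value above 1 means onnxruntime is faster. The pivot itself comes from bench_pivot; the following standalone sketch reproduces the same computation with plain pandas on made-up timings (the column names are hypothetical, the real layout is whatever bench_pivot returns):

import pandas

# made-up timings, for illustration only
df = pandas.DataFrame({'lib': ['npy', 'ort', 'npy', 'ort'],
                       'N': [10, 10, 100, 100],
                       'mean': [1.2e-5, 0.6e-5, 9.0e-5, 3.0e-5]})
piv = df.pivot_table(index='N', columns='lib', values='mean')
piv['speedup'] = piv['npy'] / piv['ort']
print(piv)  # speedup > 1: onnxruntime beats numpy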

Benchmark code#

bench_plot_onnxruntime_reduce_sum.py

# coding: utf-8
"""
Benchmark of :epkg:`onnxruntime` for
`ReduceSum <https://github.com/onnx/onnx/blob/master/docs/Operators.md#ReduceSum>`_.
"""
# Authors: Xavier Dupré (benchmark)
# License: MIT
# select a non-interactive backend before importing pyplot,
# the script may run without a display
import matplotlib
matplotlib.use('Agg')

import os
import sys
from time import perf_counter as time
import numpy
import pandas
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils._testing import ignore_warnings
from pymlbenchmark.benchmark import BenchPerfTest, BenchPerf
from pymlbenchmark.context import machine_information
from pymlbenchmark.plotting import plot_bench_results
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxReduceSumApi11
from onnxruntime import InferenceSession
from onnxruntime.capi.onnxruntime_pybind11_state import Fail
from mlprodict.onnxrt import OnnxInference

TARGET_OPSET = 13

################################
# Benchmark
# +++++++++


def generate_onnx_graph(edims, axes, input_name='X'):
    """Generates a series of consecutive additions."""

    node = OnnxReduceSumApi11(input_name, axes=list(axes),
                              op_version=TARGET_OPSET,
                              output_names=['Y'])
    onx = node.to_onnx([(input_name, FloatTensorType((None,) + tuple(edims)))],
                       outputs=[('Y', FloatTensorType())],
                       target_opset=TARGET_OPSET)
    return onx
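
# Example: generate_onnx_graph((10, 10), (1,)) returns a model whose single
# ReduceSum node sums a float32 input of shape (N, 10, 10) over axis 1.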


class GraphOrtBenchPerfTest(BenchPerfTest):
    def __init__(self, edims=(1000, 1000), axes=(1, )):
        BenchPerfTest.__init__(self)
        self.input_name = 'X'
        self.edims = edims
        self.axes = axes
        self.onx = generate_onnx_graph(edims, axes, self.input_name)
        as_string = self.onx.SerializeToString()
        try:
            self.ort = InferenceSession(as_string)
        except Fail as e:
            raise RuntimeError(
                "Issue\n{}".format(self.onx)) from e
        self.rtpy = OnnxInference(as_string, runtime='python_compiled')

        try:
            import tensorflow as tf
        except ImportError:
            tf = None
        self.tf = tf
        try:
            import torch
        except ImportError:
            torch = None
        self.torch = torch

    def fcts(self, **kwargs):
        # default arguments bind each runtime object once, at definition time

        def predict_ort(X, Xtf, Xtr, sess=self.ort):
            return sess.run(None, {self.input_name: X})[0]

        def predict_rtpy(X, Xtf, Xtr, oinf=self.rtpy):
            return oinf.run({self.input_name: X})['Y']

        def predict_npy(X, Xtf, Xtr):
            return numpy.sum(X, axis=self.axes)

        fcts = [{'lib': 'ort', 'fct': predict_ort},
                {'lib': 'npy', 'fct': predict_npy},
                {'lib': 'rtpy', 'fct': predict_rtpy}]

        if self.tf is not None:

            def predict_tf(X, Xtf, Xtr):
                return self.tf.math.reduce_sum(Xtf, axis=self.axes)

            fcts.append({'lib': 'tf', 'fct': predict_tf})

        if self.torch is not None:

            def predict_torch(X, Xtf, Xtr):
                return self.torch.sum(Xtr, dim=self.axes)

            fcts.append({'lib': 'torch', 'fct': predict_torch})

        return fcts

    def data(self, N=10, edims=None, **kwargs):  # pylint: disable=W0221
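        # Conversions to tensorflow / torch tensors are done here, outside
        # the timed functions, so they are excluded from the measurements.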
        new_dims = list((N,) + tuple(edims))
        arr = numpy.random.rand(*new_dims).astype(numpy.float32)
        tfarr = None if self.tf is None else self.tf.convert_to_tensor(arr)
        trarr = None if self.torch is None else self.torch.Tensor(arr)
        return (arr, tfarr, trarr)


def fct_filter_test(N=None, edims=None, axes=None):
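    # Keep only combinations where every axis exists in an input of shape
    # (N,) + edims: with edims=(10, 10), axes (1,) and (2,) are valid
    # while axes=(3,) would be rejected.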
    if axes is None:
        return True
    for a in axes:
        if a > len(edims):
            return False
    return True


@ignore_warnings(category=FutureWarning)
def run_bench(repeat=20, number=10, verbose=False):
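    # pbefore parameters (graph shape and axes) parameterize each test
    # instance, pafter parameters (batch size N) vary the generated data.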

    pbefore = dict(edims=[(10, 10), (100, 50), (50, 20, 5)],
                   axes=[(1, ), (2, )])
    pafter = dict(N=[1, 10, 100, 1000, 2000, 5000])

    def test(edims=None, axes=None, **opts):
        return GraphOrtBenchPerfTest(edims=edims, axes=axes, **opts)
    bp = BenchPerf(pbefore, pafter, test,
                   filter_test=fct_filter_test)

    with sklearn.config_context(assume_finite=True):
        start = time()
        results = list(bp.enumerate_run_benchs(repeat=repeat, verbose=verbose,
                                               number=number, stop_if_error=False))
        end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec\n" % (end - start))
    return results_df

#########################
# Runs the benchmark
# ++++++++++++++++++


filename = os.path.splitext(os.path.split(__file__)[-1])[0]
df = run_bench(verbose=True)
df.to_csv("%s.perf.csv" % filename, index=False)
print(df.head())

#########################
# Extracts information about the machine used
# +++++++++++++++++++++++++++++++++++++++++++

pkgs = ['numpy', 'pandas', 'sklearn', 'skl2onnx',
        'onnxruntime', 'onnx', 'mlprodict']
dfi = pandas.DataFrame(machine_information(pkgs))
dfi.to_csv("%s.time.csv" % filename, index=False)
print(dfi)


#############################
# Plot the results by number of nodes
# +++++++++++++++++++++++++++++++++++

def label_fct(la):
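    # shorten runtime identifiers for readable legends and make the batch
    # dimension N explicit in the displayed shapes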
    la = la.replace("onxpython_compiled", "opy")
    la = la.replace("onxpython", "opy")
    la = la.replace("onxonnxruntime1", "ort")
    la = la.replace("edims=(", "edims=(N, ")
    return la


plot_bench_results(df, row_cols='edims', col_cols='axes',
                   x_value='N', cmp_col_values=('lib', 'npy'),
                   title="Benchmark ReduceSum",
                   label_fct=label_fct)

plt.savefig("%s.node.png" % filename)

if "--quiet" not in sys.argv:
    plt.show()