TopK benchmark


This notebook compares the onnxruntime and mlprodict implementations of operator TopK.

from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline

Plots

We compare two runtimes by computing the ratio of their execution times and displaying it with the following kind of graph: a value below 1 means the second runtime is faster than the first one.

import matplotlib.pyplot as plt
from mlprodict.plotting.plotting import plot_benchmark_metrics


def plot_metric(metric, ax=None, xlabel="N", ylabel="k", middle=1.,
                transpose=False, shrink=1.0, title=None):
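    # plot_benchmark_metrics draws the {(N, k): value} dictionary as a heatmap
    # whose colours are centred on `middle`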
    ax, cbar = plot_benchmark_metrics(metric, ax=ax, xlabel=xlabel, ylabel=ylabel,
                                      middle=middle, transpose=transpose,
                                      cbar_kw={'shrink': shrink})
    if title is not None:
        ax.set_title(title)
    return ax


data = {(1, 1): 0.1, (10, 1): 1, (1, 10): 2,
        (10, 10): 100, (100, 1): 100, (100, 10): 1000}

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
plot_metric(data, ax[0], shrink=0.6)
plot_metric(data, ax[1], transpose=True);
../_images/onnx_topk_4_0.png

TopK in ONNX

The following lines create an ONNX graph containing a single TopK node. The cell then displays the resulting model in its text representation.

import numpy
from numpy.random import randn
from skl2onnx.algebra.onnx_ops import OnnxTopK_11

X32 = randn(100000, 100).astype(numpy.float32)

node = OnnxTopK_11('X', numpy.array([5], dtype=numpy.int64),
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx([('X', X32)], target_opset=12,
                          # shape inference does not seem to work in onnxruntime
                          # so we specify the output shape
                          outputs=[('dist', X32[:1, :5]),
                                   ('ind', X32[:1, :5].astype(numpy.int64))])
model_onnx
ir_version: 6
producer_name: "skl2onnx"
producer_version: "1.7.1076"
domain: "ai.onnx"
model_version: 0
graph {
  node {
    input: "X"
    input: "To_TopKcst"
    output: "dist"
    output: "ind"
    name: "To_TopK"
    op_type: "TopK"
    domain: ""
  }
  name: "OnnxTopK_11"
  initializer {
    dims: 1
    data_type: 7
    int64_data: 5
    name: "To_TopKcst"
  }
  input {
    name: "X"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 100
          }
        }
      }
    }
  }
  output {
    name: "dist"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
  output {
    name: "ind"
    type {
      tensor_type {
        elem_type: 7
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 11
}
from mlprodict.onnxrt import OnnxInference
oinf = OnnxInference(model_onnx, runtime="python")
res = oinf.run({'X': X32})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]
(array([[2.4611652, 1.9387559, 1.7922987, 1.7303178, 1.4945718],
        [3.1443286, 2.580894 , 2.249138 , 1.9770154, 1.9348488]],
       dtype=float32),
 array([[53, 16, 88, 50, 94],
        [27, 55, 53, 35,  8]], dtype=int64))
from mlprodict.tools import get_ir_version_from_onnx
model_onnx.ir_version = get_ir_version_from_onnx()
from onnxruntime import InferenceSession
sess = InferenceSession(model_onnx.SerializeToString())
dist, ind = sess.run(None, {'X': X32})
dist[:2], ind[:2]
(array([[2.4611652, 1.9387559, 1.7922987, 1.7303178, 1.4945718],
        [3.1443286, 2.580894 , 2.249138 , 1.9770154, 1.9348488]],
       dtype=float32),
 array([[53, 16, 88, 50, 94],
        [27, 55, 53, 35,  8]], dtype=int64))

Let’s compare the two runtimes: onnxruntime first, the mlprodict Python runtime second.

import numpy
import sklearn
from mlprodict.onnxrt.validate.validate_benchmark import benchmark_fct


def benchmark(X, fct1, fct2, N, repeat=10, number=10):

    def ti(n):
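        # scaling factor applied to repeat/number: fewer repetitions for larger N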
        if n <= 1:
            return 50
        if n <= 1000:
            return 2
        if n <= 10000:
            return 0.51
        return 0.11

    # to warm up the engine
    time_kwargs = {n: dict(repeat=10, number=10) for n in N[:2]}
    benchmark_fct(fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
    benchmark_fct(fct2, X, time_kwargs=time_kwargs, skip_long_test=False)
    # real measure
    time_kwargs = {n: dict(repeat=int(repeat * ti(n)),
                           number=int(number * ti(n))) for n in N}
    res1 = benchmark_fct(fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
    res2 = benchmark_fct(fct2, X, time_kwargs=time_kwargs, skip_long_test=False)

    res = {}
    for r in sorted(res1):
        r1 = res1[r]
        r2 = res2[r]
        ratio = r2['total'] / r1['total']
        res[r] = ratio
    return res


N = [1, 10, 100, 1000, 10000, 100000]
benchmark(X32, lambda x: sess.run(None, {'X': x}),
          lambda x: oinf.run({'X': x}), N=N)
{1: 1.037517934747998,
 10: 1.0223747854419922,
 100: 0.9193487826763499,
 1000: 0.74829307356075,
 10000: 1.1678574774530992,
 100000: 0.8680940910572188}

The mlprodict implementation tends to be faster as the number of rows grows (ratio below 1). For a single row both runtimes are comparable; for many rows, the mlprodict implementation relies on OpenMP to parallelise the computation.

C++ implementation vs numpy

scikit-learn uses numpy to compute the top k elements.
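
As an illustration, here is a minimal numpy sketch of that kind of approach, based on argpartition and argsort. The helper topk_smallest_numpy is hypothetical and only returns the k smallest values along axis 1; it is not the exact code used by scikit-learn or mlprodict.

def topk_smallest_numpy(X, k):
    # indices of the k smallest values per row, in no particular order
    part = numpy.argpartition(X, k, axis=1)[:, :k]
    values = numpy.take_along_axis(X, part, axis=1)
    # sort only those k values to return them in ascending order
    order = numpy.argsort(values, axis=1)
    indices = numpy.take_along_axis(part, order, axis=1)
    return numpy.take_along_axis(X, indices, axis=1), indices


topk_smallest_numpy(X32[:2], 5)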

from mlprodict.onnxrt.ops_cpu.op_topk import (
    topk_sorted_implementation, topk_sorted_implementation_cpp)

benchmark(X32, lambda x: topk_sorted_implementation(x, 5, 1, 0),
          lambda x: topk_sorted_implementation_cpp(x, 5, 1, 0), N=N)
{1: 0.38271023921401504,
 10: 0.3455020194211096,
 100: 0.23872655955729927,
 1000: 0.16784697368227983,
 10000: 0.2855524199185205,
 100000: 0.42808487120280947}

The C++ implementation seems faster here too (ratios below 1). Let’s profile the numpy implementation.

from pyquickhelper.pycode.profiling import profile
from IPython.display import HTML
xr = randn(1000000, 100)
HTML(profile(lambda: topk_sorted_implementation(xr, 5, 1, 0),
             pyinst_format='html')[1])

Parallelisation

We need to disable the parallelisation to really compare both implementations. The C++ implementation exposes a threshold th_para: parallelisation only kicks in when the number of rows exceeds it, so a very large value disables it and 1 forces it for every input.

from tqdm import tqdm


def benchmark_test(X, fct1, fct2, N, K, repeat=10, number=10):
    res = {}
    for k in tqdm(K):
        f1 = lambda x, k=k: fct1(x, k=k)
        f2 = lambda x, k=k: fct2(x, k=k)
        r = benchmark(X, f1, f2, N=N, repeat=repeat, number=number)
        for n, v in r.items():
            res[n, k] = v
    return res


K = [1, 2, 5, 10, 15]
N = [1, 2, 3, 10, 100, 1000, 10000, 100000]

bench_para = benchmark_test(
    X32, (lambda x, k: topk_sorted_implementation_cpp(x, k=k, axis=1, largest=0, th_para=100000000)),
    (lambda x, k: topk_sorted_implementation_cpp(x, k=k, axis=1, largest=0, th_para=1)),
    N=N, K=K)
100%|██████████| 5/5 [00:33<00:00,  6.67s/it]
plot_metric(bench_para, transpose=False, title="TopK and parallelisation\n"
            "< 1 means parallelisation is faster", shrink=0.75);
../_images/onnx_topk_19_0.png

This is somewhat expected. The first column is close to 1 because the same code is being compared against itself. The next columns are red, meaning the parallelisation does not help there; it only starts to pay off for bigger N, at least above 100.

Parallelisation with ONNX

We replicate the same experiment with an ONNX graph.

k_ = numpy.array([3], dtype=numpy.int64)
node = OnnxTopK_11('X', 'k',
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx([('X', X32), ('k', k_)], target_opset=12,
                          # shape inference does not seem to work in onnxruntime
                          # so we specify the output shape
                          outputs=[('dist', X32[:1, :5]),
                                   ('ind', X32[:1, :5].astype(numpy.int64))])
oinf_no_para = OnnxInference(model_onnx, runtime="python")
res = oinf_no_para.run({'X': X32, 'k': k_})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]
(array([[2.4611652, 1.9387559, 1.7922987],
        [3.1443286, 2.580894 , 2.249138 ]], dtype=float32),
 array([[53, 16, 88],
        [27, 55, 53]], dtype=int64))
oinf_para = OnnxInference(model_onnx, runtime="python")
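# change the parallelisation threshold of the TopK node directly on the runtime:
# a very large value disables parallelisation, 1 enables it for any input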
oinf_no_para.sequence_[0].ops_.th_para = 100000000
oinf_para.sequence_[0].ops_.th_para = 1
bench_onnx_para = benchmark_test(
    X32, (lambda x, k: oinf_no_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
100%|██████████| 5/5 [01:10<00:00, 14.20s/it]
plot_metric(bench_onnx_para, transpose=False, title="TopK and parallelisation with ONNX\n"
            "< 1 means parallelisation is faster", shrink=0.75);
../_images/onnx_topk_26_0.png

Pretty much the same results.

onnxruntime vs mlprodict (no parallelisation)

model_onnx.ir_version = get_ir_version_from_onnx()
sess = InferenceSession(model_onnx.SerializeToString())
bench_ort = benchmark_test(
    X32, (lambda x, k: sess.run(None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_no_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
100%|██████████| 5/5 [01:14<00:00, 14.86s/it]
plot_metric(bench_ort, transpose=False, title="TopK, onnxruntime vs mlprodict\n"
            "< 1 means mlprodict is faster\nno parallelisation", shrink=0.75);
../_images/onnx_topk_31_0.png

It seems the implementation of operator TopK in onnxruntime can still be improved. The versions used in this benchmark are printed below.

from onnxruntime import __version__ as ort_version
from mlprodict import __version__ as mlp_version
ort_version, mlp_version
('1.3.995', '0.4.1188')

And now with parallelisation enabled above 50 rows.

oinf_para.sequence_[0].ops_.th_para = 50
bench_ort_para = benchmark_test(
    X32, (lambda x, k: sess.run(None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
100%|██████████| 5/5 [00:59<00:00, 11.93s/it]
plot_metric(bench_ort_para, transpose=False, title="TopK, onnxruntime vs mlprodict\n"
            "< 1 means mlprodict is faster\nparallelisation above 50 rows", shrink=0.75);
../_images/onnx_topk_36_0.png

Interesting…

Comparison with onnxruntime

from skl2onnx.algebra.onnx_ops import OnnxTopK
from skl2onnx.common.data_types import FloatTensorType
from mlprodict.onnx_conv import to_onnx
from onnxruntime import InferenceSession


X = numpy.array([
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
], dtype=numpy.float32)

K = numpy.array([3], dtype=numpy.int64)


node = OnnxTopK('X', K, output_names=['values', 'indices'],
                op_version=12)
onx = node.to_onnx([('X', FloatTensorType())])

py_topk = OnnxInference(onx, runtime="python_compiled")
ort_topk = InferenceSession(onx.SerializeToString())
py_topk.run({'X': X})
{'values': array([[ 3.,  2.,  1.],
        [ 7.,  6.,  5.],
        [11., 10.,  9.]], dtype=float32),
 'indices': array([[3, 2, 1],
        [3, 2, 1],
        [3, 2, 1]], dtype=int64)}
ort_topk.run(None, {'X': X})
[array([[ 3.,  2.,  1.],
        [ 7.,  6.,  5.],
        [11., 10.,  9.]], dtype=float32),
 array([[3, 2, 1],
        [3, 2, 1],
        [3, 2, 1]], dtype=int64)]
%timeit py_topk.run({'X': X})
10.7 µs ± 1.04 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit ort_topk.run(None, {'X': X})
14.6 µs ± 629 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
X = numpy.random.randn(10000, 100).astype(numpy.float32)
%timeit py_topk.run({'X': X})
1.76 ms ± 151 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit ort_topk.run(None, {'X': X})
2.25 ms ± 97.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)