# TopK benchmark¶

This notebook compares the onnxruntime and mlprodict implementations of operator TopK.

```
from jyquickhelper import add_notebook_menu
add_notebook_menu()
```
```
%matplotlib inline
```

## Plots¶

We compare two runtimes by computing the ratio of their execution times and displaying it with the following kind of graph: a value below 1 means the second implementation is faster.

```
import matplotlib.pyplot as plt
from mlprodict.plotting.plotting import plot_benchmark_metrics

def plot_metric(metric, ax=None, xlabel="N", ylabel="k", middle=1.,
                transpose=False, shrink=1.0, title=None):
    ax, cbar = plot_benchmark_metrics(metric, ax=ax, xlabel=xlabel, ylabel=ylabel,
                                      middle=middle, transpose=transpose,
                                      cbar_kw={'shrink': shrink})
    if title is not None:
        ax.set_title(title)
    return ax

data = {(1, 1): 0.1, (10, 1): 1, (1, 10): 2,
        (10, 10): 100, (100, 1): 100, (100, 10): 1000}

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
plot_metric(data, ax[0], shrink=0.6)
plot_metric(data, ax[1], transpose=True);
```

## TopK in ONNX¶

The following lines create an ONNX graph made of a single TopK node. The resulting graph is displayed below in its text representation.

```
import numpy
from numpy.random import randn
from skl2onnx.algebra.onnx_ops import OnnxTopK_11

X32 = randn(100000, 100).astype(numpy.float32)

node = OnnxTopK_11('X', numpy.array([5], dtype=numpy.int64),
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx([('X', X32)], target_opset=12,
                          # shape inference does not seem to work in onnxruntime
                          # so we specify the output shape
                          outputs=[('dist', X32[:1, :5]),
                                   ('ind', X32[:1, :5].astype(numpy.int64))])
model_onnx
```
```
ir_version: 6
producer_name: "skl2onnx"
producer_version: "1.7.1076"
domain: "ai.onnx"
model_version: 0
graph {
  node {
    input: "X"
    input: "To_TopKcst"
    output: "dist"
    output: "ind"
    name: "To_TopK"
    op_type: "TopK"
    domain: ""
  }
  name: "OnnxTopK_11"
  initializer {
    dims: 1
    data_type: 7
    int64_data: 5
    name: "To_TopKcst"
  }
  input {
    name: "X"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 100
          }
        }
      }
    }
  }
  output {
    name: "dist"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
  output {
    name: "ind"
    type {
      tensor_type {
        elem_type: 7
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 11
}
```
```
from mlprodict.onnxrt import OnnxInference
oinf = OnnxInference(model_onnx, runtime="python")
res = oinf.run({'X': X32})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]
```
```
(array([[2.4611652, 1.9387559, 1.7922987, 1.7303178, 1.4945718],
        [3.1443286, 2.580894 , 2.249138 , 1.9770154, 1.9348488]],
       dtype=float32),
 array([[53, 16, 88, 50, 94],
        [27, 55, 53, 35,  8]], dtype=int64))
```
```
from mlprodict.tools import get_ir_version_from_onnx
# aligns the model's IR version with what the installed onnxruntime supports
model_onnx.ir_version = get_ir_version_from_onnx()
```
```
from onnxruntime import InferenceSession
sess = InferenceSession(model_onnx.SerializeToString())
dist, ind = sess.run(None, {'X': X32})
dist[:2], ind[:2]
```
```
(array([[2.4611652, 1.9387559, 1.7922987, 1.7303178, 1.4945718],
        [3.1443286, 2.580894 , 2.249138 , 1.9770154, 1.9348488]],
       dtype=float32),
 array([[53, 16, 88, 50, 94],
        [27, 55, 53, 35,  8]], dtype=int64))
```

Let’s compare two implementations.

```
import numpy
import sklearn
from mlprodict.onnxrt.validate.validate_benchmark import benchmark_fct

def benchmark(X, fct1, fct2, N, repeat=10, number=10):

    def ti(n):
        # fewer repetitions for the big batch sizes
        if n <= 1:
            return 50
        if n <= 1000:
            return 2
        if n <= 10000:
            return 0.51
        return 0.11

    # to warm up the engine
    time_kwargs = {n: dict(repeat=10, number=10) for n in N[:2]}
    benchmark_fct(fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
    benchmark_fct(fct2, X, time_kwargs=time_kwargs, skip_long_test=False)
    # real measure
    time_kwargs = {n: dict(repeat=int(repeat * ti(n)),
                           number=int(number * ti(n))) for n in N}
    res1 = benchmark_fct(fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
    res2 = benchmark_fct(fct2, X, time_kwargs=time_kwargs, skip_long_test=False)

    res = {}
    for r in sorted(res1):
        r1 = res1[r]
        r2 = res2[r]
        # ratio > 1 means fct2 is slower than fct1
        ratio = r2['total'] / r1['total']
        res[r] = ratio
    return res

N = [1, 10, 100, 1000, 10000, 100000]
benchmark(X32, lambda x: sess.run(None, {'X': x}),
          lambda x: oinf.run({'X': x}), N=N)
```
```
{1: 1.037517934747998,
 10: 1.0223747854419922,
 100: 0.9193487826763499,
 1000: 0.74829307356075,
 10000: 1.1678574774530992,
 100000: 0.8680940910572188}
```

The ratio compares mlprodict to onnxruntime, so values below 1 mean mlprodict is faster. The mlprodict implementation tends to win as the number of rows grows; for a single row, both runtimes are comparable. For many rows, the implementation relies on openmp to parallelise the computation.
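
To picture that strategy, here is a rough, hypothetical Python analogue of row-wise parallelisation: the rows are split into chunks that are processed independently. The real kernel does this with OpenMP threads in C++; Python threads mostly serialise on the GIL, so this sketch only illustrates how the work is split, not the speed-up.

```
from concurrent.futures import ThreadPoolExecutor
import numpy

def topk_rows_parallel(mat, k, n_chunks=4):
    # each chunk of rows is an independent top-k problem
    def work(sub):
        ind = numpy.argsort(-sub, axis=1)[:, :k]
        return numpy.take_along_axis(sub, ind, axis=1), ind
    chunks = numpy.array_split(mat, n_chunks)
    with ThreadPoolExecutor(max_workers=n_chunks) as ex:
        parts = list(ex.map(work, chunks))
    # stitch the partial results back in row order
    values = numpy.vstack([v for v, _ in parts])
    indices = numpy.vstack([i for _, i in parts])
    return values, indices
```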

## C++ implementation vs numpy¶

scikit-learn uses numpy to compute the top k elements.
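
A common numpy recipe relies on `argpartition`, which gathers the k best elements in linear time so that only those k need a full sort. The sketch below shows the idea; the helper `topk_numpy` is ours, and mlprodict's `topk_sorted_implementation` may differ in its details.

```
import numpy

def topk_numpy(mat, k, axis=1, largest=True):
    sign = -1 if largest else 1
    # argpartition places the k best entries first in O(n) without sorting them
    part = numpy.argpartition(sign * mat, k - 1, axis=axis).take(
        numpy.arange(k), axis=axis)
    val = numpy.take_along_axis(mat, part, axis=axis)
    # only the k selected values are fully sorted
    order = numpy.argsort(sign * val, axis=axis)
    ind = numpy.take_along_axis(part, order, axis=axis)
    return numpy.take_along_axis(mat, ind, axis=axis), ind
```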

```
from mlprodict.onnxrt.ops_cpu.op_topk import (
    topk_sorted_implementation, topk_sorted_implementation_cpp)

benchmark(X32, lambda x: topk_sorted_implementation(x, 5, 1, 0),
          lambda x: topk_sorted_implementation_cpp(x, 5, 1, 0), N=N)
```
```
{1: 0.38271023921401504,
 10: 0.3455020194211096,
 100: 0.23872655955729927,
 1000: 0.16784697368227983,
 10000: 0.2855524199185205,
 100000: 0.42808487120280947}
```

The C++ implementation seems faster here too. Let's profile the numpy one.

```
from pyquickhelper.pycode.profiling import profile
from IPython.display import HTML

xr = randn(1000000, 100)
HTML(profile(lambda: topk_sorted_implementation(xr, 5, 1, 0),
             pyinst_format='html')[1])
```

## Parallelisation¶

We need to disable the parallelisation to really compare both implementations.

```
from tqdm import tqdm

def benchmark_test(X, fct1, fct2, N, K, repeat=10, number=10):
    res = {}
    for k in tqdm(K):
        f1 = lambda x, k=k: fct1(x, k=k)
        f2 = lambda x, k=k: fct2(x, k=k)
        r = benchmark(X, f1, f2, N=N, repeat=repeat, number=number)
        for n, v in r.items():
            res[n, k] = v
    return res

K = [1, 2, 5, 10, 15]
N = [1, 2, 3, 10, 100, 1000, 10000, 100000]

# th_para is the number of rows above which the C++ kernel parallelises:
# a huge threshold disables parallelisation, a threshold of 1 forces it
bench_para = benchmark_test(
    X32, (lambda x, k: topk_sorted_implementation_cpp(
        x, k=k, axis=1, largest=0, th_para=100000000)),
    (lambda x, k: topk_sorted_implementation_cpp(
        x, k=k, axis=1, largest=0, th_para=1)),
    N=N, K=K)
```
```
100%|██████████| 5/5 [00:33<00:00,  6.67s/it]
```
```
plot_metric(bench_para, transpose=False,
            title="TopK and parallelisation\n"
                  "< 1 means parallelisation is faster", shrink=0.75);
```

This is somewhat expected. The first column is close to 1 as the same code is being compared. The next columns are red, meaning parallelisation does not help there; it only pays off for bigger N, at least above 100.

## Parallelisation with ONNX¶

We replicate the same experiment with an ONNX graph.

```
k_ = numpy.array([3], dtype=numpy.int64)
node = OnnxTopK_11('X', 'k',
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx([('X', X32), ('k', k_)], target_opset=12,
                          # shape inference does not seem to work in onnxruntime
                          # so we specify the output shape
                          outputs=[('dist', X32[:1, :5]),
                                   ('ind', X32[:1, :5].astype(numpy.int64))])
```
```
oinf_no_para = OnnxInference(model_onnx, runtime="python")
res = oinf_no_para.run({'X': X32, 'k': k_})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]
```
```
(array([[2.4611652, 1.9387559, 1.7922987],
        [3.1443286, 2.580894 , 2.249138 ]], dtype=float32),
 array([[53, 16, 88],
        [27, 55, 53]], dtype=int64))
```
```
oinf_para = OnnxInference(model_onnx, runtime="python")
# th_para thresholds: never parallelise vs always parallelise
oinf_no_para.sequence_[0].ops_.th_para = 100000000
oinf_para.sequence_[0].ops_.th_para = 1
```
```
bench_onnx_para = benchmark_test(
    X32, (lambda x, k: oinf_no_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
```
```
100%|██████████| 5/5 [01:10<00:00, 14.20s/it]
```
```
plot_metric(bench_onnx_para, transpose=False,
            title="TopK and parallelisation with ONNX\n"
                  "< 1 means parallelisation is faster", shrink=0.75);
```

Pretty much the same results.

## onnxruntime vs mlprodict (no parallelisation)¶

```
model_onnx.ir_version = get_ir_version_from_onnx()
sess = InferenceSession(model_onnx.SerializeToString())
```
```
bench_ort = benchmark_test(
    X32, (lambda x, k: sess.run(None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_no_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
```
```
100%|██████████| 5/5 [01:14<00:00, 14.86s/it]
```
```
plot_metric(bench_ort, transpose=False,
            title="TopK, onnxruntime vs mlprodict\n"
                  "< 1 means mlprodict is faster\nno parallelisation", shrink=0.75);
```

It seems the implementation of operator TopK in onnxruntime can be improved; the exact versions used here are printed below.

```
from onnxruntime import __version__ as ort_version
from mlprodict import __version__ as mlp_version
ort_version, mlp_version
```
```
('1.3.995', '0.4.1188')
```

And with parallelisation above 50 rows.

```
oinf_para.sequence_[0].ops_.th_para = 50
bench_ort_para = benchmark_test(
    X32, (lambda x, k: sess.run(None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
```
```
100%|██████████| 5/5 [00:59<00:00, 11.93s/it]
```
```
plot_metric(bench_ort_para, transpose=False,
            title="TopK, onnxruntime vs mlprodict\n"
                  "< 1 means mlprodict is faster\nparallelisation above 50 rows", shrink=0.75);
```

Interesting…

## Comparison with onnxruntime¶

```
from skl2onnx.algebra.onnx_ops import OnnxTopK
from skl2onnx.common.data_types import FloatTensorType
from mlprodict.onnx_conv import to_onnx
from onnxruntime import InferenceSession

X = numpy.array([
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
], dtype=numpy.float32)

K = numpy.array([3], dtype=numpy.int64)

node = OnnxTopK('X', K, output_names=['values', 'indices'],
                op_version=12)
onx = node.to_onnx([('X', FloatTensorType())])

py_topk = OnnxInference(onx, runtime="python_compiled")
ort_topk = InferenceSession(onx.SerializeToString())
```
```
py_topk.run({'X': X})
```
```
{'values': array([[ 3.,  2.,  1.],
        [ 7.,  6.,  5.],
        [11., 10.,  9.]], dtype=float32),
 'indices': array([[3, 2, 1],
        [3, 2, 1],
        [3, 2, 1]], dtype=int64)}
```
```
ort_topk.run(None, {'X': X})
```
```
[array([[ 3.,  2.,  1.],
        [ 7.,  6.,  5.],
        [11., 10.,  9.]], dtype=float32),
 array([[3, 2, 1],
        [3, 2, 1],
        [3, 2, 1]], dtype=int64)]
```
```
%timeit py_topk.run({'X': X})
```
```
10.7 µs ± 1.04 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
```
```
%timeit ort_topk.run(None, {'X': X})
```
```
14.6 µs ± 629 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
```
```
X = numpy.random.randn(10000, 100).astype(numpy.float32)
```
```
%timeit py_topk.run({'X': X})
```
```
1.76 ms ± 151 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
```
%timeit ort_topk.run(None, {'X': X})
```
```
2.25 ms ± 97.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```