# TopK benchmark¶

This notebook compares the onnxruntime and mlprodict implementations of operator TopK.

```
from jyquickhelper import add_notebook_menu
add_notebook_menu()
```
```
%matplotlib inline
```

## Plots¶

We compare two runtimes by computing the ratio of their execution times and displaying it with the following kind of graph: a value below 1 means the second implementation is faster.

```
import matplotlib.pyplot as plt
from mlprodict.plotting.plotting import plot_benchmark_metrics

def plot_metric(metric, ax=None, xlabel="N", ylabel="k", middle=1.,
                transpose=False, shrink=1.0, title=None):
    ax, cbar = plot_benchmark_metrics(metric, ax=ax, xlabel=xlabel, ylabel=ylabel,
                                      middle=middle, transpose=transpose,
                                      cbar_kw={'shrink': shrink})
    if title is not None:
        ax.set_title(title)
    return ax

data = {(1, 1): 0.1, (10, 1): 1, (1, 10): 2,
        (10, 10): 100, (100, 1): 100, (100, 10): 1000}

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
plot_metric(data, ax[0], shrink=0.6)
plot_metric(data, ax[1], transpose=True);
```

## TopK in ONNX¶

The following lines create an ONNX graph made of a single TopK node. The resulting graph is displayed below in its text representation.

```
import numpy
from numpy.random import randn
from skl2onnx.algebra.onnx_ops import OnnxTopK_11

X32 = randn(100000, 100).astype(numpy.float32)

node = OnnxTopK_11('X', numpy.array([5], dtype=numpy.int64),
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx([('X', X32)], target_opset=12,
                          # shape inference does not seem to work in onnxruntime
                          # so we specify the output shape
                          outputs=[('dist', X32[:1, :5]),
                                   ('ind', X32[:1, :5].astype(numpy.int64))])
model_onnx
```
```
ir_version: 6
producer_name: "skl2onnx"
producer_version: "1.7.1076"
domain: "ai.onnx"
model_version: 0
graph {
  node {
    input: "X"
    input: "To_TopKcst"
    output: "dist"
    output: "ind"
    name: "To_TopK"
    op_type: "TopK"
    domain: ""
  }
  name: "OnnxTopK_11"
  initializer {
    dims: 1
    data_type: 7
    int64_data: 5
    name: "To_TopKcst"
  }
  input {
    name: "X"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 100
          }
        }
      }
    }
  }
  output {
    name: "dist"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
  output {
    name: "ind"
    type {
      tensor_type {
        elem_type: 7
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 11
}
```
```
from mlprodict.onnxrt import OnnxInference
oinf = OnnxInference(model_onnx, runtime="python")
res = oinf.run({'X': X32})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]
```
```
(array([[2.4611652, 1.9387559, 1.7922987, 1.7303178, 1.4945718],
        [3.1443286, 2.580894 , 2.249138 , 1.9770154, 1.9348488]],
       dtype=float32),
 array([[53, 16, 88, 50, 94],
        [27, 55, 53, 35,  8]], dtype=int64))
```
```
from mlprodict.tools import get_ir_version_from_onnx
# aligns the model's IR version with what the installed onnxruntime supports
model_onnx.ir_version = get_ir_version_from_onnx()
```
```
from onnxruntime import InferenceSession
sess = InferenceSession(model_onnx.SerializeToString())
dist, ind = sess.run(None, {'X': X32})
dist[:2], ind[:2]
```
```
(array([[2.4611652, 1.9387559, 1.7922987, 1.7303178, 1.4945718],
        [3.1443286, 2.580894 , 2.249138 , 1.9770154, 1.9348488]],
       dtype=float32),
 array([[53, 16, 88, 50, 94],
        [27, 55, 53, 35,  8]], dtype=int64))
```

Let’s compare two implementations.

```
import numpy
import sklearn
from mlprodict.onnxrt.validate.validate_benchmark import benchmark_fct

def benchmark(X, fct1, fct2, N, repeat=10, number=10):

    def ti(n):
        # fewer repetitions for the big batch sizes
        if n <= 1:
            return 50
        if n <= 1000:
            return 2
        if n <= 10000:
            return 0.51
        return 0.11

    # to warm up the engine
    time_kwargs = {n: dict(repeat=10, number=10) for n in N[:2]}
    benchmark_fct(fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
    benchmark_fct(fct2, X, time_kwargs=time_kwargs, skip_long_test=False)
    # real measure
    time_kwargs = {n: dict(repeat=int(repeat * ti(n)),
                           number=int(number * ti(n))) for n in N}
    res1 = benchmark_fct(fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
    res2 = benchmark_fct(fct2, X, time_kwargs=time_kwargs, skip_long_test=False)

    res = {}
    for r in sorted(res1):
        r1 = res1[r]
        r2 = res2[r]
        # ratio > 1 means fct2 is slower than fct1
        ratio = r2['total'] / r1['total']
        res[r] = ratio
    return res

N = [1, 10, 100, 1000, 10000, 100000]
benchmark(X32, lambda x: sess.run(None, {'X': x}),
          lambda x: oinf.run({'X': x}), N=N)
```
```
{1: 1.037517934747998,
 10: 1.0223747854419922,
 100: 0.9193487826763499,
 1000: 0.74829307356075,
 10000: 1.1678574774530992,
 100000: 0.8680940910572188}
```

The ratio compares mlprodict to onnxruntime, so values below 1 mean mlprodict is faster. The mlprodict implementation tends to win as the number of rows grows; for a single row, both runtimes are comparable. For many rows, the implementation relies on openmp to parallelise the computation.
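
To picture that strategy, here is a rough, hypothetical Python analogue of row-wise parallelisation: the rows are split into chunks that are processed independently. The real kernel does this with OpenMP threads in C++; Python threads mostly serialise on the GIL, so this sketch only illustrates how the work is split, not the speed-up.

```
from concurrent.futures import ThreadPoolExecutor
import numpy

def topk_rows_parallel(mat, k, n_chunks=4):
    # each chunk of rows is an independent top-k problem
    def work(sub):
        ind = numpy.argsort(-sub, axis=1)[:, :k]
        return numpy.take_along_axis(sub, ind, axis=1), ind
    chunks = numpy.array_split(mat, n_chunks)
    with ThreadPoolExecutor(max_workers=n_chunks) as ex:
        parts = list(ex.map(work, chunks))
    # stitch the partial results back in row order
    values = numpy.vstack([v for v, _ in parts])
    indices = numpy.vstack([i for _, i in parts])
    return values, indices
```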

## C++ implementation vs numpy¶

scikit-learn uses numpy to compute the top k elements.
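
A common numpy recipe relies on `argpartition`, which gathers the k best elements in linear time so that only those k need a full sort. The sketch below shows the idea; the helper `topk_numpy` is ours, and mlprodict's `topk_sorted_implementation` may differ in its details.

```
import numpy

def topk_numpy(mat, k, axis=1, largest=True):
    sign = -1 if largest else 1
    # argpartition places the k best entries first in O(n) without sorting them
    part = numpy.argpartition(sign * mat, k - 1, axis=axis).take(
        numpy.arange(k), axis=axis)
    val = numpy.take_along_axis(mat, part, axis=axis)
    # only the k selected values are fully sorted
    order = numpy.argsort(sign * val, axis=axis)
    ind = numpy.take_along_axis(part, order, axis=axis)
    return numpy.take_along_axis(mat, ind, axis=axis), ind
```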

```
from mlprodict.onnxrt.ops_cpu.op_topk import (
    topk_sorted_implementation, topk_sorted_implementation_cpp)

benchmark(X32, lambda x: topk_sorted_implementation(x, 5, 1, 0),
          lambda x: topk_sorted_implementation_cpp(x, 5, 1, 0), N=N)
```
```
{1: 0.38271023921401504,
 10: 0.3455020194211096,
 100: 0.23872655955729927,
 1000: 0.16784697368227983,
 10000: 0.2855524199185205,
 100000: 0.42808487120280947}
```

The C++ implementation seems faster here too. Let's profile the numpy one.

```
from pyquickhelper.pycode.profiling import profile
from IPython.display import HTML

xr = randn(1000000, 100)
HTML(profile(lambda: topk_sorted_implementation(xr, 5, 1, 0),
             pyinst_format='html')[1])
```

## Parallelisation¶

We need to disable the parallelisation to really compare both implementations.

```
from tqdm import tqdm

def benchmark_test(X, fct1, fct2, N, K, repeat=10, number=10):
    res = {}
    for k in tqdm(K):
        f1 = lambda x, k=k: fct1(x, k=k)
        f2 = lambda x, k=k: fct2(x, k=k)
        r = benchmark(X, f1, f2, N=N, repeat=repeat, number=number)
        for n, v in r.items():
            res[n, k] = v
    return res

K = [1, 2, 5, 10, 15]
N = [1, 2, 3, 10, 100, 1000, 10000, 100000]

# th_para is the number of rows above which the C++ kernel parallelises:
# a huge threshold disables parallelisation, a threshold of 1 forces it
bench_para = benchmark_test(
    X32, (lambda x, k: topk_sorted_implementation_cpp(
        x, k=k, axis=1, largest=0, th_para=100000000)),
    (lambda x, k: topk_sorted_implementation_cpp(
        x, k=k, axis=1, largest=0, th_para=1)),
    N=N, K=K)
```
```
100%|██████████| 5/5 [00:33<00:00,  6.67s/it]
```
```
plot_metric(bench_para, transpose=False,
            title="TopK and parallelisation\n"
                  "< 1 means parallelisation is faster", shrink=0.75);
```

This is somewhat expected. The first column is close to 1 as the same code is being compared. The next columns are red, meaning parallelisation does not help there; it only pays off for bigger N, at least above 100.

## Parallelisation with ONNX¶

We replicate the same experiment with an ONNX graph.

```
k_ = numpy.array([3], dtype=numpy.int64)
node = OnnxTopK_11('X', 'k',
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx([('X', X32), ('k', k_)], target_opset=12,
                          # shape inference does not seem to work in onnxruntime
                          # so we specify the output shape
                          outputs=[('dist', X32[:1, :5]),
                                   ('ind', X32[:1, :5].astype(numpy.int64))])
```
```
oinf_no_para = OnnxInference(model_onnx, runtime="python")
res = oinf_no_para.run({'X': X32, 'k': k_})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]
```
```
(array([[2.4611652, 1.9387559, 1.7922987],
        [3.1443286, 2.580894 , 2.249138 ]], dtype=float32),
 array([[53, 16, 88],
        [27, 55, 53]], dtype=int64))
```
```
oinf_para = OnnxInference(model_onnx, runtime="python")
# th_para thresholds: never parallelise vs always parallelise
oinf_no_para.sequence_[0].ops_.th_para = 100000000
oinf_para.sequence_[0].ops_.th_para = 1
```
```
bench_onnx_para = benchmark_test(
    X32, (lambda x, k: oinf_no_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
```
```
100%|██████████| 5/5 [01:10<00:00, 14.20s/it]
```
```
plot_metric(bench_onnx_para, transpose=False,
            title="TopK and parallelisation with ONNX\n"
                  "< 1 means parallelisation is faster", shrink=0.75);
```

Pretty much the same results.

## onnxruntime vs mlprodict (no parallelisation)¶

```
model_onnx.ir_version = get_ir_version_from_onnx()
sess = InferenceSession(model_onnx.SerializeToString())
```
```
bench_ort = benchmark_test(
    X32, (lambda x, k: sess.run(None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_no_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
```
```
100%|██████████| 5/5 [01:14<00:00, 14.86s/it]
```
```
plot_metric(bench_ort, transpose=False,
            title="TopK, onnxruntime vs mlprodict\n"
                  "< 1 means mlprodict is faster\nno parallelisation", shrink=0.75);
```

It seems the implementation of operator TopK in onnxruntime can be improved; the exact versions used here are printed below.

```
from onnxruntime import __version__ as ort_version
from mlprodict import __version__ as mlp_version
ort_version, mlp_version
```
```
('1.3.995', '0.4.1188')
```

And with parallelisation above 50 rows.

```
oinf_para.sequence_[0].ops_.th_para = 50
bench_ort_para = benchmark_test(
    X32, (lambda x, k: sess.run(None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run({'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
```
```
100%|██████████| 5/5 [00:59<00:00, 11.93s/it]
```
```
plot_metric(bench_ort_para, transpose=False,
            title="TopK, onnxruntime vs mlprodict\n"
                  "< 1 means mlprodict is faster\nparallelisation above 50 rows", shrink=0.75);
```

Interesting…

## Comparison with onnxruntime¶

```
from skl2onnx.algebra.onnx_ops import OnnxTopK
from skl2onnx.common.data_types import FloatTensorType
from mlprodict.onnx_conv import to_onnx
from onnxruntime import InferenceSession

X = numpy.array([
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
], dtype=numpy.float32)

K = numpy.array([3], dtype=numpy.int64)

node = OnnxTopK('X', K, output_names=['values', 'indices'],
                op_version=12)
onx = node.to_onnx([('X', FloatTensorType())])

py_topk = OnnxInference(onx, runtime="python_compiled")
ort_topk = InferenceSession(onx.SerializeToString())
```
```
py_topk.run({'X': X})
```
```
{'values': array([[ 3.,  2.,  1.],
        [ 7.,  6.,  5.],
        [11., 10.,  9.]], dtype=float32),
 'indices': array([[3, 2, 1],
        [3, 2, 1],
        [3, 2, 1]], dtype=int64)}
```
```
ort_topk.run(None, {'X': X})
```
```
[array([[ 3.,  2.,  1.],
        [ 7.,  6.,  5.],
        [11., 10.,  9.]], dtype=float32),
 array([[3, 2, 1],
        [3, 2, 1],
        [3, 2, 1]], dtype=int64)]
```
```
%timeit py_topk.run({'X': X})
```
```
10.7 µs ± 1.04 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
```
```
%timeit ort_topk.run(None, {'X': X})
```
```
14.6 µs ± 629 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
```
```
X = numpy.random.randn(10000, 100).astype(numpy.float32)
```
```
%timeit py_topk.run({'X': X})
```
```
1.76 ms ± 151 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
```
%timeit ort_topk.run(None, {'X': X})
```
```
2.25 ms ± 97.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```