TopK benchmark#

This example compares onnxruntime and mlprodict for an implementation of operator TopK. We compare the two runtimes by computing the ratio of their execution times, displayed through the following kinds of graphs.

Graph to compare performance#

from numpy.random import randn
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame
from onnxruntime import InferenceSession, __version__ as ort_version
from tqdm import tqdm
from cpyquickhelper.numbers import measure_time
from pyquickhelper.pycode.profiling import profile
from skl2onnx.algebra.onnx_ops import OnnxTopK_11
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxTopK
from mlprodict.onnxrt.validate.validate_benchmark import benchmark_fct
from mlprodict.onnxrt import OnnxInference
from mlprodict.onnxrt.ops_cpu.op_topk import (
    topk_sorted_implementation, topk_sorted_implementation_cpp)
from mlprodict import __version__ as mlp_version
from mlprodict.plotting.plotting import plot_benchmark_metrics

Available optimisation on this machine.

from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
print(code_optimisation())

Out:

AVX-omp=8

Graph.

def plot_metric(metric, ax=None, xlabel="N", ylabel="k", middle=1.,
                transpose=False, shrink=1.0, title=None):
    """
    Draws a heatmap of benchmark ratios on *ax* (or a new axis) and
    returns the axis, optionally setting a title.

    :param metric: mapping ``(x, y) -> ratio`` to display
    :param ax: existing matplotlib axis or None to create one
    :param xlabel: label of the x axis
    :param ylabel: label of the y axis
    :param middle: value drawn as the neutral colour
    :param transpose: swaps the two axes of the heatmap
    :param shrink: shrink factor forwarded to the colorbar
    :param title: optional title for the axis
    :return: the matplotlib axis holding the heatmap
    """
    axis, _cbar = plot_benchmark_metrics(
        metric, ax=ax, xlabel=xlabel, ylabel=ylabel, middle=middle,
        transpose=transpose, cbar_kw={'shrink': shrink})
    if title is not None:
        axis.set_title(title)
    return axis


data = {(1, 1): 0.1, (10, 1): 1, (1, 10): 2,
        (10, 10): 100, (100, 1): 100, (100, 10): 1000}

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
plot_metric(data, ax[0], shrink=0.6)
plot op onnx topk

Out:

<AxesSubplot:xlabel='N', ylabel='k'>
plot_metric(data, ax[1], transpose=True)

Out:

<AxesSubplot:xlabel='k', ylabel='N'>

TopK in ONNX#

The following lines create an ONNX graph using one TopK ONNX node. The outcome is the ONNX graph displayed in its text representation.

X32 = randn(100000, 100).astype(numpy.float32)

node = OnnxTopK_11('X', numpy.array([5], dtype=numpy.int64),
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx(
    [('X', X32)], target_opset=12,
    # shape inference does not seem to work in onnxruntime
    # so we specify the output shape
    outputs=[('dist', X32[:1, :5]),
             ('ind', X32[:1, :5].astype(numpy.int64))])
model_onnx

Out:

ir_version: 6
producer_name: "skl2onnx"
producer_version: "1.12.999"
domain: "ai.onnx"
model_version: 0
graph {
  node {
    input: "X"
    input: "To_TopKcst"
    output: "dist"
    output: "ind"
    name: "To_TopK"
    op_type: "TopK"
    domain: ""
  }
  name: "OnnxTopK_11"
  initializer {
    dims: 1
    data_type: 7
    int64_data: 5
    name: "To_TopKcst"
  }
  input {
    name: "X"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 100
          }
        }
      }
    }
  }
  output {
    name: "dist"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
  output {
    name: "ind"
    type {
      tensor_type {
        elem_type: 7
        shape {
          dim {
          }
          dim {
            dim_value: 5
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 11
}

That gives…

oinf = OnnxInference(model_onnx, runtime="python")
res = oinf.run({'X': X32})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]

Out:

(array([[1.8322875, 1.6574656, 1.5371112, 1.533833 , 1.3560296],
       [1.9024433, 1.6438056, 1.589202 , 1.5111674, 1.5020684]],
      dtype=float32), array([[44, 67,  7, 11,  5],
       [16, 41, 14, 90, 72]]))

With onnxruntime.

sess = InferenceSession(model_onnx.SerializeToString())
dist, ind = sess.run(None, {'X': X32})
dist[:2], ind[:2]

Out:

(array([[1.8322875, 1.6574656, 1.5371112, 1.533833 , 1.3560296],
       [1.9024433, 1.6438056, 1.589202 , 1.5111674, 1.5020684]],
      dtype=float32), array([[44, 67,  7, 11,  5],
       [16, 41, 14, 90, 72]], dtype=int64))

Let’s compare two implementations.

def benchmark(X, fct1, fct2, N, repeat=10, number=10):
    """
    Measures *fct1* and *fct2* on input *X* for every size in *N* and
    returns, per size, the ratio of their total execution times
    (``time(fct2) / time(fct1)``, so values below 1 mean *fct2* is faster).

    :param X: input array handed to both functions
    :param fct1: first function to time (denominator)
    :param fct2: second function to time (numerator)
    :param N: iterable of problem sizes forwarded to :func:`benchmark_fct`
    :param repeat: base number of repetitions, scaled per size
    :param number: base number of iterations, scaled per size
    :return: dictionary ``{size: ratio}``
    """

    def scale(n):
        # Large inputs get fewer iterations to keep the benchmark short.
        if n <= 1:
            return 50
        if n <= 1000:
            return 2
        if n <= 10000:
            return 0.51
        return 0.11

    # Warm up both runtimes on the two smallest sizes before measuring.
    warmup_kwargs = {n: dict(repeat=10, number=10) for n in N[:2]}
    benchmark_fct(fct1, X, time_kwargs=warmup_kwargs, skip_long_test=False)
    benchmark_fct(fct2, X, time_kwargs=warmup_kwargs, skip_long_test=False)

    # Real measurement with size-dependent repeat/number counts.
    measure_kwargs = {
        n: dict(repeat=int(repeat * scale(n)), number=int(number * scale(n)))
        for n in N}
    res1 = benchmark_fct(fct1, X, time_kwargs=measure_kwargs,
                         skip_long_test=False)
    res2 = benchmark_fct(fct2, X, time_kwargs=measure_kwargs,
                         skip_long_test=False)

    return {r: res2[r]['ttime'] / res1[r]['ttime'] for r in sorted(res1)}


N = [1, 10, 100, 1000, 10000, 100000]
res = benchmark(X32, lambda x: sess.run(None, {'X': x}),
                lambda x: oinf.run({'X': x}), N=N)
res

Out:

{1: 1.4504362079553554, 10: 1.4345952691683033, 100: 0.6372096968373944, 1000: 0.5852394044922027, 10000: 0.6498384499381442, 100000: 0.7501577911719474}

The implementation in mlprodict is faster when the number of rows grows. onnxruntime is faster for a single row; for many rows, the mlprodict implementation uses openmp to parallelize.

C++ implementation vs numpy#

scikit-learn uses numpy to compute the top k elements.

res = benchmark(X32, lambda x: topk_sorted_implementation(x, 5, 1, 0),
                lambda x: topk_sorted_implementation_cpp(x, 5, 1, 0), N=N)
res

Out:

{1: 0.3148533175197631, 10: 0.3150096966828008, 100: 0.17426163978915785, 1000: 0.07318128126372192, 10000: 0.08684122671170744, 100000: 0.0826421648990161}

It seems to be faster too. Let’s profile.

xr = randn(1000000, 100)
text = profile(lambda: topk_sorted_implementation(xr, 5, 1, 0),
               pyinst_format='text')[1]
print(text)

Out:

  _     ._   __/__   _ _  _  _ _/_   Recorded: 03:27:27 AM Samples:  5
 /_//_/// /_\ / //_// / //_'/ //     Duration: 3.047     CPU time: 3.039
/   _/                      v4.1.1

Program: somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_onnx_topk.py

3.046 profile  ../pycode/profiling.py:457
`- 3.046 <lambda>  plot_op_onnx_topk.py:177
      [15 frames hidden]  plot_op_onnx_topk, mlprodict, <__arra...
         3.046 topk_sorted_implementation  mlprodict/onnxrt/ops_cpu/op_topk.py:18
         |  2.024 ndarray.argpartition  <built-in>:0
         |- 0.938 [self]

Parallelisation#

We need to disable the parallelisation to really compare both implementation.

# In[11]:


def benchmark_test(X, fct1, fct2, N, K, repeat=10, number=10):
    """
    Runs :func:`benchmark` for every value of *k* in *K* and gathers the
    time ratios into a single dictionary keyed by ``(n, k)``.

    :param X: input array handed to both functions
    :param fct1: first function, called as ``fct1(x, k=k)``
    :param fct2: second function, called as ``fct2(x, k=k)``
    :param N: problem sizes forwarded to :func:`benchmark`
    :param K: values of the top-k parameter to test
    :param repeat: base number of repetitions forwarded to :func:`benchmark`
    :param number: base number of iterations forwarded to :func:`benchmark`
    :return: dictionary ``{(n, k): ratio}``
    """
    res = {}
    for k in tqdm(K):
        # Bind k as a default argument to avoid late-binding closures.
        def f1(x, k=k): return fct1(x, k=k)
        def f2(x, k=k): return fct2(x, k=k)
        # Fixed: previously benchmarked the global X32 instead of the
        # parameter X, ignoring the caller's input.
        r = benchmark(X, f1, f2, N=N, repeat=repeat, number=number)
        for n, v in r.items():
            res[n, k] = v
    return res


K = [1, 2, 5, 10, 15]
N = [1, 2, 3, 10, 100, 1000, 10000]

bench_para = benchmark_test(
    X32, (lambda x, k: topk_sorted_implementation_cpp(
        x, k=k, axis=1, largest=0, th_para=100000000)),
    (lambda x, k: topk_sorted_implementation_cpp(
        x, k=k, axis=1, largest=0, th_para=1)),
    N=N, K=K)

bench_para

Out:

  0%|          | 0/5 [00:00<?, ?it/s]
 20%|##        | 1/5 [00:14<00:58, 14.57s/it]
 40%|####      | 2/5 [00:29<00:43, 14.61s/it]
 60%|######    | 3/5 [00:44<00:29, 14.86s/it]
 80%|########  | 4/5 [01:00<00:15, 15.43s/it]
100%|##########| 5/5 [01:17<00:00, 16.10s/it]
100%|##########| 5/5 [01:17<00:00, 15.59s/it]

{(1, 1): 0.9970330104920017, (2, 1): 1.2062710118891071, (3, 1): 1.1879399831100979, (10, 1): 2.842107223147799, (100, 1): 0.7690324524347808, (1000, 1): 0.30015377947156013, (10000, 1): 0.32574383576740795, (1, 2): 0.9973543561106335, (2, 2): 1.1947758863077522, (3, 2): 1.1969832456090832, (10, 2): 1.116168475474925, (100, 2): 0.5648099067143836, (1000, 2): 0.22114620817037522, (10000, 2): 0.2505101777637194, (1, 5): 0.998548019704205, (2, 5): 1.1826136332187265, (3, 5): 1.208370877499602, (10, 5): 1.0126078588138006, (100, 5): 0.42024194621333827, (1000, 5): 0.18986503807532878, (10000, 5): 0.21037588577161093, (1, 10): 0.9999959770621354, (2, 10): 1.1530161093026396, (3, 10): 1.094442296587939, (10, 10): 0.8645675159090817, (100, 10): 0.30806565926434537, (1000, 10): 0.16943223177444433, (10000, 10): 0.17814027043202652, (1, 15): 0.9952256567671903, (2, 15): 1.1782382174700112, (3, 15): 1.070046254661734, (10, 15): 0.781369136828188, (100, 15): 0.27245459880816586, (1000, 15): 0.16629550379596045, (10000, 15): 0.168878837864309}

As a graph.

plot_metric(bench_para, transpose=False, title="TopK and parallelisation\n"
            "< 1 means parallelisation is faster", shrink=0.75)
TopK and parallelisation < 1 means parallelisation is faster

Out:

<AxesSubplot:title={'center':'TopK and parallelisation\n< 1 means parallelisation is faster'}, xlabel='N', ylabel='k'>

This is somehow expected. The first column is close to 1 as it is the same code being compared. The next columns are red, meaning the parallelisation does not help there; the parallelisation helps for bigger N, at least more than 100.

Parallelisation with ONNX#

We replicate the same experiment with an ONNX graph.

k_ = numpy.array([3], dtype=numpy.int64)
node = OnnxTopK_11('X', 'k',
                   output_names=['dist', 'ind'])

model_onnx = node.to_onnx(
    [('X', X32), ('k', k_)], target_opset=12,
    # shape inference does not seem to work in onnxruntime
    # so we specify the output shape
    outputs=[('dist', X32[:1, :5]),
             ('ind', X32[:1, :5].astype(numpy.int64))])

Test

oinf_no_para = OnnxInference(model_onnx, runtime="python")
res = oinf_no_para.run({'X': X32, 'k': k_})
dist, ind = res['dist'], res['ind']
dist[:2], ind[:2]

Out:

(array([[1.8322875, 1.6574656, 1.5371112],
       [1.9024433, 1.6438056, 1.589202 ]], dtype=float32), array([[44, 67,  7],
       [16, 41, 14]]))

Let’s play with the thresholds triggering the parallelisation.

oinf_para = OnnxInference(model_onnx, runtime="python")
oinf_no_para.sequence_[0].ops_.th_para = 100000000
oinf_para.sequence_[0].ops_.th_para = 1

Results.

bench_onnx_para = benchmark_test(
    X32, (lambda x, k: oinf_no_para.run(
        {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run(
        {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
bench_onnx_para

Out:

  0%|          | 0/5 [00:00<?, ?it/s]
 20%|##        | 1/5 [00:42<02:50, 42.67s/it]
 40%|####      | 2/5 [01:25<02:08, 42.99s/it]
 60%|######    | 3/5 [02:10<01:27, 43.54s/it]
 80%|########  | 4/5 [02:55<00:44, 44.09s/it]
100%|##########| 5/5 [03:42<00:00, 45.15s/it]
100%|##########| 5/5 [03:42<00:00, 44.41s/it]

{(1, 1): 1.0078529461789785, (2, 1): 1.0656090007485628, (3, 1): 1.0617129921467094, (10, 1): 1.0430426838803268, (100, 1): 0.8950294470629733, (1000, 1): 0.42708986434297935, (10000, 1): 0.3689730851893795, (1, 2): 0.9892372910976307, (2, 2): 1.1078421982348812, (3, 2): 1.0191187098958585, (10, 2): 0.9938923537604795, (100, 2): 0.7437068177052948, (1000, 2): 0.3199603638713188, (10000, 2): 0.2876594498203759, (1, 5): 1.0072816886528608, (2, 5): 1.0339565860764446, (3, 5): 1.0631145710907237, (10, 5): 1.0080210253747133, (100, 5): 0.6004919771513039, (1000, 5): 0.2663541997193623, (10000, 5): 0.23853147337629482, (1, 10): 1.013188365017895, (2, 10): 1.0925966640604587, (3, 10): 1.1003013938287072, (10, 10): 0.9680085921676722, (100, 10): 0.4699081405137954, (1000, 10): 0.22512220723203133, (10000, 10): 0.19870530785851057, (1, 15): 0.9761970861808663, (2, 15): 11.203870801146762, (3, 15): 1.283586533201994, (10, 15): 8.93853013296494, (100, 15): 0.5848444409550148, (1000, 15): 0.2860581126703213, (10000, 15): 0.24375694857926614}

As a graph.

plot_metric(bench_onnx_para, transpose=False,
            title="TopK and parallelisation with ONNX\n< 1 means "
            "parallelisation is faster", shrink=0.75)
TopK and parallelisation with ONNX < 1 means parallelisation is faster

Out:

<AxesSubplot:title={'center':'TopK and parallelisation with ONNX\n< 1 means parallelisation is faster'}, xlabel='N', ylabel='k'>

Pretty much the same results.

onnxruntime vs mlprodict (no parallelisation)#

sess = InferenceSession(model_onnx.SerializeToString())


bench_ort = benchmark_test(
    X32, (lambda x, k: sess.run(
        None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_no_para.run(
        {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
bench_ort

Out:

  0%|          | 0/5 [00:00<?, ?it/s]
 20%|##        | 1/5 [00:37<02:28, 37.13s/it]
 40%|####      | 2/5 [01:15<01:53, 37.72s/it]
 60%|######    | 3/5 [01:54<01:16, 38.28s/it]
 80%|########  | 4/5 [02:33<00:38, 38.86s/it]
100%|##########| 5/5 [03:15<00:00, 39.79s/it]
100%|##########| 5/5 [03:15<00:00, 39.08s/it]

{(1, 1): 1.3337945680692338, (2, 1): 1.3392487892727731, (3, 1): 1.3372289273955145, (10, 1): 1.3195619385450508, (100, 1): 1.1875392113188192, (1000, 1): 0.91669855462369, (10000, 1): 3.0962588103985773, (1, 2): 1.3523797929037482, (2, 2): 1.403753309157073, (3, 2): 1.4063639722881254, (10, 2): 1.3830240462105265, (100, 2): 1.191928868543275, (1000, 2): 0.9361909441387706, (10000, 2): 2.930845157399065, (1, 5): 1.3355966561704, (2, 5): 1.3750508365060101, (3, 5): 1.363631380708658, (10, 5): 1.3124725119411689, (100, 5): 1.069295617244892, (1000, 5): 2.28452897325411, (10000, 5): 3.2649505546391806, (1, 10): 1.30794777844922, (2, 10): 1.2894698058859702, (3, 10): 1.2821736002228092, (10, 10): 1.2155775976595458, (100, 10): 0.9888530559105226, (1000, 10): 2.924267009852096, (10000, 10): 3.2601748084178417, (1, 15): 1.319967267671452, (2, 15): 1.3534815825144482, (3, 15): 1.3333231986976968, (10, 15): 1.238795633903483, (100, 15): 0.971011835376701, (1000, 15): 3.097332801530916, (10000, 15): 3.1960159151758316}

As a graph.

plot_metric(bench_ort, transpose=False,
            title="TopK, onnxruntime vs mlprodict\n< 1 means mlprodict "
            "is faster\nno parallelisation", shrink=0.75)
TopK, onnxruntime vs mlprodict < 1 means mlprodict is faster no parallelisation

Out:

<AxesSubplot:title={'center':'TopK, onnxruntime vs mlprodict\n< 1 means mlprodict is faster\nno parallelisation'}, xlabel='N', ylabel='k'>

It seems the implementation of operator TopK in onnxruntime 1.1.1 can be improved.

Versions:

ort_version, mlp_version

Out:

('1.11.1', '0.8.1809')

And with parallelisation above 50 rows.

oinf_para.sequence_[0].ops_.th_para = 50
bench_ort_para = benchmark_test(
    X32, (lambda x, k: sess.run(
        None, {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    (lambda x, k: oinf_para.run(
        {'X': x, 'k': numpy.array([k], dtype=numpy.int64)})),
    N=N, K=K)
bench_ort_para

Out:

  0%|          | 0/5 [00:00<?, ?it/s]
 20%|##        | 1/5 [00:38<02:34, 38.50s/it]
 40%|####      | 2/5 [01:18<01:57, 39.26s/it]
 60%|######    | 3/5 [01:59<01:20, 40.02s/it]
 80%|########  | 4/5 [02:41<00:41, 41.06s/it]
100%|##########| 5/5 [03:25<00:00, 41.83s/it]
100%|##########| 5/5 [03:25<00:00, 41.01s/it]

{(1, 1): 1.3447531995186726, (2, 1): 1.3882063071413895, (3, 1): 1.3852939750423987, (10, 1): 1.375810841775583, (100, 1): 9.450267096352304, (1000, 1): 5.410766250156249, (10000, 1): 6.464397284958566, (1, 2): 1.3426493957629861, (2, 2): 1.3787639149660402, (3, 2): 1.3782662916023154, (10, 2): 1.3461084064326663, (100, 2): 25.672239812849416, (1000, 2): 4.376135456407118, (10000, 2): 4.6053811238472395, (1, 5): 1.3372984262418341, (2, 5): 1.2889438839964609, (3, 5): 1.3073536290259253, (10, 5): 1.30241783950362, (100, 5): 13.039611299479063, (1000, 5): 10.265911500509551, (10000, 5): 2.3959277225063236, (1, 10): 1.3245078596414441, (2, 10): 1.2822179989884033, (3, 10): 1.2783771367770242, (10, 10): 1.206361534852372, (100, 10): 15.626297356995162, (1000, 10): 8.39229148029485, (10000, 10): 1.839896883644029, (1, 15): 1.298628437375448, (2, 15): 1.2729623688529732, (3, 15): 1.2608451637169031, (10, 15): 1.1694254530201271, (100, 15): 9.298013740539028, (1000, 15): 7.021307302505504, (10000, 15): 1.5022267290192273}

As a graph.

plot_metric(bench_ort_para, transpose=False,
            title="TopK, onnxruntime vs mlprodict\n< 1 means mlprodict "
            "is faster\nparallelisation above 50 rows", shrink=0.75)
TopK, onnxruntime vs mlprodict < 1 means mlprodict is faster parallelisation above 50 rows

Out:

<AxesSubplot:title={'center':'TopK, onnxruntime vs mlprodict\n< 1 means mlprodict is faster\nparallelisation above 50 rows'}, xlabel='N', ylabel='k'>
onnxruntime and mlprodict implement the same algorithm.

The only difference comes from the threshold which triggers the parallelisation. It should depend on the machine. That explains the difference in time for 100 observations.

X = numpy.array([
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
], dtype=numpy.float32)

K = numpy.array([3], dtype=numpy.int64)


node = OnnxTopK('X', K, output_names=['values', 'indices'],
                op_version=12)
onx = node.to_onnx([('X', FloatTensorType())])

py_topk = OnnxInference(onx, runtime="python_compiled")
ort_topk = InferenceSession(onx.SerializeToString())

Check the outputs.

r1 = py_topk.run({'X': X})
r1

Out:

{'values': array([[ 3.,  2.,  1.],
       [ 7.,  6.,  5.],
       [11., 10.,  9.]], dtype=float32), 'indices': array([[3, 2, 1],
       [3, 2, 1],
       [3, 2, 1]])}
r2 = ort_topk.run(None, {'X': X})
r2

Out:

[array([[ 3.,  2.,  1.],
       [ 7.,  6.,  5.],
       [11., 10.,  9.]], dtype=float32), array([[3, 2, 1],
       [3, 2, 1],
       [3, 2, 1]], dtype=int64)]

Some figures.

bs = []
bs.append(measure_time(lambda: py_topk.run({'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'py'
bs[-1]

Out:

{'average': 5.01311019907007e-05, 'deviation': 5.653122841632005e-07, 'min_exec': 4.9664499965729195e-05, 'max_exec': 5.166525996173732e-05, 'repeat': 10, 'number': 50, 'ttime': 0.000501311019907007, 'context_size': 2272, 'c': 'py'}
bs.append(measure_time(lambda: ort_topk.run(None, {'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'or'
bs[-1]

Out:

{'average': 4.983760599861852e-05, 'deviation': 6.569137590764718e-07, 'min_exec': 4.942647996358573e-05, 'max_exec': 5.1694059948204085e-05, 'repeat': 10, 'number': 50, 'ttime': 0.0004983760599861852, 'context_size': 2272, 'c': 'or'}
X = numpy.random.randn(10000, 100).astype(numpy.float32)


bs.append(measure_time(lambda: py_topk.run({'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'py-100'
bs[-1]

Out:

{'average': 0.007777833220010508, 'deviation': 6.412324244098513e-05, 'min_exec': 0.007707859220099636, 'max_exec': 0.007882172620011261, 'repeat': 10, 'number': 50, 'ttime': 0.07777833220010508, 'context_size': 2272, 'c': 'py-100'}
bs.append(measure_time(lambda: ort_topk.run(None, {'X': X}),
                       context=globals(), div_by_number=True))
bs[-1]['c'] = 'ort-100'
bs[-1]

Out:

{'average': 0.0020234485379914983, 'deviation': 3.50715378003053e-05, 'min_exec': 0.001978598819987383, 'max_exec': 0.0020840817400312518, 'repeat': 10, 'number': 50, 'ttime': 0.020234485379914985, 'context_size': 2272, 'c': 'ort-100'}
df = DataFrame(bs)
df
average deviation min_exec max_exec repeat number ttime context_size c
0 0.000050 5.653123e-07 0.000050 0.000052 10 50 0.000501 2272 py
1 0.000050 6.569138e-07 0.000049 0.000052 10 50 0.000498 2272 or
2 0.007778 6.412324e-05 0.007708 0.007882 10 50 0.077778 2272 py-100
3 0.002023 3.507154e-05 0.001979 0.002084 10 50 0.020234 2272 ort-100


Total running time of the script: ( 13 minutes 7.389 seconds)

Gallery generated by Sphinx-Gallery