Compares implementations of Add#

This example compares the addition implemented by numpy with the one implemented by onnxruntime. The call to numpy.add is chained three times, which reduces the relative cost of copying the data from Python to an external library. If available, tensorflow and pytorch are included as well. The numpy implementation is not optimal: it allocates more buffers than necessary because the parameter out is not used to reuse buffers.

import numpy
import pandas
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxAdd
from cpyquickhelper.numbers import measure_time
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
# Display the value returned by mlprodict's experimental code_optimisation().
print(code_optimisation())

Out:

AVX-omp=8

Add implementations#

# Optional backends: tf_add / torch_add are set to None when the import
# fails, and benchmark_op only measures a backend whose function is not None.
try:
    from tensorflow.math import add as tf_add
    from tensorflow import convert_to_tensor
except ImportError:
    # tensorflow is not installed: the 'tf' measurements are skipped.
    tf_add = None
try:
    from torch import add as torch_add, from_numpy
except ImportError:
    # torch is not installed: the 'torch' measurements are skipped.
    torch_add = None


def build_ort_add(op_version=12):
    """Create an onnxruntime-backed callable computing ``x + y + y + y``.

    Three ``Add`` nodes are chained into one ONNX graph with inputs
    ``x`` and ``y`` and a final output named ``z``.

    :param op_version: opset used for every node and as target opset
    :return: function ``(x, y)`` returning the result of
        ``InferenceSession.run`` for the graph
    """
    first = OnnxAdd('x', 'y', op_version=op_version)
    second = OnnxAdd(first, 'y', op_version=op_version)
    # Only the last node names its output.
    last = OnnxAdd(second, 'y', op_version=op_version, output_names=['z'])

    input_types = [(label, FloatTensorType()) for label in ('x', 'y')]
    model = last.to_onnx(inputs=input_types, target_opset=op_version)
    session = InferenceSession(model.SerializeToString())

    def run(x, y):
        return session.run(None, {'x': x, 'y': y})

    return run


def loop_fct(fct, xs, ys):
    """Call ``fct`` on every pair taken in lockstep from ``xs`` and ``ys``.

    Results of ``fct`` are discarded; iteration stops at the shorter input.
    """
    for pair in zip(xs, ys):
        fct(*pair)


def _time_variant(label, fct, xs, ys, dim, info, repeat, number):
    """Time ``fct`` over all ``(x, y)`` pairs and return the labelled record."""
    ctx = dict(xs=xs, ys=ys, fct=fct, loop_fct=loop_fct)
    obs = measure_time(
        "loop_fct(fct, xs, ys)",
        div_by_number=True, context=ctx, repeat=repeat, number=number)
    obs['dim'] = dim
    obs['fct'] = label
    obs.update(info)
    return obs


def benchmark_op(repeat=5, number=2, name="Add", shape_fcts=None):
    """Benchmark three chained additions for numpy, onnxruntime, and,
    when they could be imported, tensorflow and torch.

    :param repeat: ``repeat`` argument forwarded to ``measure_time``
    :param number: ``number`` argument forwarded to ``measure_time``
    :param name: operator name used in the plot titles
    :param shape_fcts: pair of functions mapping a dimension ``dim`` to the
        shapes of the left and right operands; defaults to
        ``(5, dim, dim)`` for both
    :return: tuple ``(df, rs, ax)``: the raw measurements as a dataframe,
        the speedup table relative to numpy and the two matplotlib axes
    """
    if shape_fcts is None:
        def shape_fct(dim):
            return (5, dim, dim)
        shape_fcts = (shape_fct, shape_fct)
    ort_fct = build_ort_add()
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 1024, 1536, 2048, 2560]):
        shape1 = shape_fcts[0](dim)
        shape2 = shape_fcts[1](dim)
        # Fewer arrays for the large dimensions and for 4D shapes.
        n_arrays = 16 if dim < 512 else 4
        if len(shape1) > 3:
            n_arrays //= 4
        xs = [numpy.random.rand(*shape1).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [numpy.random.rand(*shape2).astype(numpy.float32)
              for _ in range(n_arrays)]
        info = dict(shape1=shape1, shape2=shape2)

        # numpy
        res.append(_time_variant(
            'numpy',
            lambda x, y: numpy.add(numpy.add(numpy.add(x, y), y), y),
            xs, ys, dim, info, repeat, number))

        # onnxruntime
        res.append(_time_variant(
            'ort', ort_fct, xs, ys, dim, info, repeat, number))

        if tf_add is not None:
            # tensorflow: inputs are converted before the timed section.
            res.append(_time_variant(
                'tf',
                lambda x, y: tf_add(tf_add(tf_add(x, y), y), y),
                [convert_to_tensor(x) for x in xs],
                [convert_to_tensor(y) for y in ys],
                dim, info, repeat, number))

        if torch_add is not None:
            # torch: inputs are wrapped before the timed section.
            res.append(_time_variant(
                'torch',
                lambda x, y: torch_add(torch_add(torch_add(x, y), y), y),
                [from_numpy(x) for x in xs],
                [from_numpy(y) for y in ys],
                dim, info, repeat, number))

    # Dataframes
    # The shapes of the last iteration are turned into a generic label
    # by replacing the last dimension value with "N".
    shape1_name = str(shape1).replace(str(dim), "N")
    shape2_name = str(shape2).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    # Keyword arguments: positional arguments to DataFrame.pivot are
    # rejected by pandas >= 2.0.
    piv = df.pivot(index='N', columns='fct', values='average')

    # Speedup relative to numpy (numpy time divided by the other time).
    rs = piv.copy()
    for c in ['ort', 'torch', 'tf']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%s + %s"
                   " lower better" % (name, shape1_name, shape2_name))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%s + %s"
                  " higher better" % (name, shape1_name, shape2_name))
    # Horizontal guides at half and twice the numpy speed.
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax


# Collects the dataframe returned by every benchmark_op call below;
# the list is concatenated and exported at the end of the script.
dfs = []

(5, N, N) + (5, N, N)#

# Default shapes: both operands are (5, N, N).
df, piv, ax = benchmark_op()
dfs.append(df)
# Keyword arguments: DataFrame.pivot rejects positional ones in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
Add benchmark (5, N, N) + (5, N, N) lower better, Add Speedup, baseline=numpy (5, N, N) + (5, N, N) higher better

Out:

  0%|          | 0/14 [00:00<?, ?it/s]
  7%|7         | 1/14 [00:00<00:02,  6.06it/s]
 29%|##8       | 4/14 [00:00<00:00, 13.76it/s]
 43%|####2     | 6/14 [00:00<00:01,  7.14it/s]
 50%|#####     | 7/14 [00:01<00:01,  3.93it/s]
 57%|#####7    | 8/14 [00:02<00:02,  2.20it/s]
 64%|######4   | 9/14 [00:04<00:04,  1.02it/s]
 71%|#######1  | 10/14 [00:05<00:03,  1.01it/s]
 79%|#######8  | 11/14 [00:09<00:05,  1.81s/it]
 86%|########5 | 12/14 [00:18<00:07,  3.74s/it]
 93%|#########2| 13/14 [00:34<00:07,  7.36s/it]
100%|##########| 14/14 [00:58<00:00, 12.20s/it]
100%|##########| 14/14 [00:58<00:00,  4.17s/it]
N 8 16 32 64 100 128 200 256 400 512 1024 1536 2048 2560
fct
numpy 0.000267 0.000338 0.000564 0.001423 0.002678 0.007259 0.020023 0.033135 0.076218 0.032142 0.126711 0.281942 0.562169 0.802069
ort 0.000920 0.001019 0.001225 0.002194 0.005417 0.010167 0.014224 0.024930 0.056474 0.023849 0.091059 0.191718 0.346620 0.531688
torch 0.013377 0.000827 0.001337 0.003308 0.004292 0.004564 0.011526 0.020006 0.046386 0.020035 0.071442 0.147367 0.299730 0.391453


(5, N, N) + (5, N, 1)#

# The second operand has a last axis of size 1.
shape_fcts = (lambda dim: (5, dim, dim),
              lambda dim: (5, dim, 1))

df, piv, ax = benchmark_op(shape_fcts=shape_fcts)
dfs.append(df)
# Keyword arguments: DataFrame.pivot rejects positional ones in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
Add benchmark (5, N, N) + (5, N, 1) lower better, Add Speedup, baseline=numpy (5, N, N) + (5, N, 1) higher better

Out:

  0%|          | 0/14 [00:00<?, ?it/s]
 21%|##1       | 3/14 [00:00<00:00, 27.69it/s]
 43%|####2     | 6/14 [00:00<00:00,  8.55it/s]
 57%|#####7    | 8/14 [00:01<00:01,  3.35it/s]
 64%|######4   | 9/14 [00:03<00:03,  1.63it/s]
 71%|#######1  | 10/14 [00:04<00:02,  1.53it/s]
 79%|#######8  | 11/14 [00:07<00:03,  1.23s/it]
 86%|########5 | 12/14 [00:13<00:05,  2.55s/it]
 93%|#########2| 13/14 [00:26<00:05,  5.21s/it]
100%|##########| 14/14 [00:48<00:00,  9.82s/it]
100%|##########| 14/14 [00:48<00:00,  3.43s/it]
N 8 16 32 64 100 128 200 256 400 512 1024 1536 2048 2560
fct
numpy 0.000679 0.000784 0.001028 0.002359 0.003709 0.007896 0.019710 0.029816 0.070191 0.030208 0.119649 0.244929 0.466762 0.819825
ort 0.001056 0.001183 0.001525 0.002663 0.005986 0.009728 0.012172 0.019097 0.047169 0.018809 0.073877 0.161103 0.298888 0.447637
torch 0.000850 0.000991 0.001539 0.003489 0.004501 0.004659 0.009864 0.012045 0.035437 0.017164 0.055939 0.114856 0.264561 0.556731


(5, N, N) + (5, 1, N)#

# The second operand has a middle axis of size 1.
shape_fcts = (lambda dim: (5, dim, dim),
              lambda dim: (5, 1, dim))

df, piv, ax = benchmark_op(shape_fcts=shape_fcts)
dfs.append(df)
# Keyword arguments: DataFrame.pivot rejects positional ones in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
Add benchmark (5, N, N) + (5, 1, N) lower better, Add Speedup, baseline=numpy (5, N, N) + (5, 1, N) higher better

Out:

  0%|          | 0/14 [00:00<?, ?it/s]
 21%|##1       | 3/14 [00:00<00:00, 26.85it/s]
 43%|####2     | 6/14 [00:00<00:00,  8.67it/s]
 57%|#####7    | 8/14 [00:01<00:01,  3.40it/s]
 64%|######4   | 9/14 [00:03<00:03,  1.65it/s]
 71%|#######1  | 10/14 [00:04<00:02,  1.55it/s]
 79%|#######8  | 11/14 [00:07<00:03,  1.25s/it]
 86%|########5 | 12/14 [00:13<00:05,  2.57s/it]
 93%|#########2| 13/14 [00:27<00:05,  5.55s/it]
100%|##########| 14/14 [00:47<00:00,  9.69s/it]
100%|##########| 14/14 [00:47<00:00,  3.43s/it]
N 8 16 32 64 100 128 200 256 400 512 1024 1536 2048 2560
fct
numpy 0.000693 0.000797 0.001082 0.001879 0.003870 0.006086 0.018613 0.029117 0.067375 0.027049 0.122258 0.243730 0.475513 0.666302
ort 0.001086 0.001219 0.001556 0.002706 0.006059 0.009942 0.012372 0.019146 0.047837 0.020070 0.079950 0.162697 0.293459 0.457919
torch 0.000876 0.001035 0.001619 0.003866 0.004634 0.004863 0.010034 0.011696 0.034866 0.017657 0.057578 0.113741 0.369843 0.558821


(5, N, 5, N) + (1, N, 1, 1)#

# 4D case: the second operand only varies along the second axis.
shape_fcts = (lambda dim: (5, dim, 5, dim),
              lambda dim: (1, dim, 1, 1))

df, piv, ax = benchmark_op(shape_fcts=shape_fcts)
dfs.append(df)
# Keyword arguments: DataFrame.pivot rejects positional ones in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
Add benchmark (5, N, 5, N) + (1, N, 1, 1) lower better, Add Speedup, baseline=numpy (5, N, 5, N) + (1, N, 1, 1) higher better

Out:

  0%|          | 0/14 [00:00<?, ?it/s]
 29%|##8       | 4/14 [00:00<00:00, 24.09it/s]
 50%|#####     | 7/14 [00:01<00:01,  4.95it/s]
 64%|######4   | 9/14 [00:04<00:03,  1.52it/s]
 71%|#######1  | 10/14 [00:05<00:02,  1.41it/s]
 79%|#######8  | 11/14 [00:08<00:04,  1.34s/it]
 86%|########5 | 12/14 [00:20<00:07,  3.78s/it]
 93%|#########2| 13/14 [00:41<00:07,  7.99s/it]
100%|##########| 14/14 [01:14<00:00, 14.55s/it]
100%|##########| 14/14 [01:14<00:00,  5.30s/it]
N 8 16 32 64 100 128 200 256 400 512 1024 1536 2048 2560
fct
numpy 0.000205 0.000277 0.000549 0.002259 0.005684 0.009097 0.020812 0.033515 0.081772 0.034990 0.132870 0.480585 0.749898 1.116248
ort 0.000288 0.000356 0.000660 0.001709 0.003590 0.006171 0.014282 0.023927 0.057011 0.022528 0.099226 0.280794 0.515249 0.929336
torch 0.000273 0.000418 0.001009 0.006171 0.006287 0.007488 0.014779 0.021221 0.043613 0.019068 0.063868 0.253706 0.425147 0.667463


Conclusion#

It is difficult to draw a final conclusion, as the addition of two vectors is of the same order of magnitude as a copy between Python and the C++ code of onnxruntime, pytorch or tensorflow. numpy is much better for small vectors. onnxruntime, pytorch and tensorflow are not optimized for this case because it is not very common in deep learning.

# Concatenate the measurements of all the benchmarks above and export them.
merged = pandas.concat(dfs)
name = "add"
merged.to_csv("plot_%s.csv" % name, index=False)
merged.to_excel("plot_%s.xlsx" % name, index=False)
# NOTE(review): savefig acts on the current matplotlib figure, presumably
# the one created by the last benchmark_op call — confirm this is intended.
plt.savefig("plot_%s.png" % name)

plt.show()
plot op add

Total running time of the script: ( 4 minutes 2.478 seconds)

Gallery generated by Sphinx-Gallery