Compares implementations of ReduceMax#

This example compares the numpy implementation of the operator ReduceMax to the onnxruntime implementation. If available, tensorflow and pytorch are included as well.

Available optimisation#

The code shows which parallelisation optimisation could be used (AVX or SSE) and the number of available processors.

import numpy
import pandas
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxReduceMax
from cpyquickhelper.numbers import measure_time
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
# Prints the optimisation string returned by mlprodict's experimental C
# module (the recorded output below reads ``AVX-omp=8``).
print(code_optimisation())

Out:

AVX-omp=8

ReduceMax implementations#

try:
    from tensorflow.math import reduce_max as tf_reduce_max
    from tensorflow import convert_to_tensor
except ImportError:
    tf_reduce_max = None
try:
    from torch import max as torch_max, from_numpy
except ImportError:
    torch_max = None


def build_ort_reducemax(axes, op_version=14):  # opset=13, 14, ...
    """
    Builds a single-node ONNX graph applying ReduceMax over *axes*
    and loads it into an onnxruntime session.

    :param axes: axes to reduce, stored in the ONNX node
    :param op_version: opset used both for the node and the model
    :return: function ``f(x, y)`` running the session on *x*;
        *y* is accepted but not used
    """
    reduce_node = OnnxReduceMax(
        'x', axes=axes, op_version=op_version, output_names=['z'])
    model = reduce_node.to_onnx(
        inputs=[('x', FloatTensorType())], target_opset=op_version)
    session = InferenceSession(model.SerializeToString())

    def run(x, y):
        # Only x is fed to the graph; the axes are already in the node.
        return session.run(None, {'x': x})

    return run


def loop_fct(fct, xs, ys):
    """
    Calls ``fct(x, y)`` on every pair taken in lockstep from *xs* and *ys*.

    Iteration stops at the shorter of the two sequences and the results
    of *fct* are discarded.
    """
    pairs = zip(xs, ys)
    for pair in pairs:
        fct(*pair)


def benchmark_op(axes, repeat=2, number=5, name="ReduceMax", shape_fct=None):
    """
    Benchmarks ReduceMax over *axes* with numpy, onnxruntime and, when
    they could be imported, tensorflow and pytorch.

    :param axes: tuple of axes to reduce
    :param repeat: forwarded to ``measure_time``
    :param number: forwarded to ``measure_time``
    :param name: operator name displayed in the graph titles
    :param shape_fct: function mapping the varying dimension *N* to the
        input shape, defaults to ``(3, N, 1, 128, 64)``
    :return: tuple ``(df, rs, ax)`` where *df* holds the raw measures,
        *rs* the ratio ``numpy average / implementation average`` per *N*
        and *ax* the two matplotlib axes
    """
    if shape_fct is None:
        def shape_fct(dim):
            return (3, dim, 1, 128, 64)
    ort_fct = build_ort_reducemax(axes)
    res = []

    # ``torch.max(x, dim)`` reduces one axis at a time and drops it, so the
    # axes are reduced from the highest to the lowest to keep the remaining
    # indices valid. Every requested axis is reduced, whatever their number.
    torch_axes = sorted((int(a) for a in axes), reverse=True)

    def torch_reduce(x, y):
        # y is not used: the axes are precomputed once in torch_axes.
        for axis in torch_axes:
            x = torch_max(x, axis)[0]
        return x

    def record(ctx, dim, label, info):
        # Times one implementation and stores the observation in res.
        obs = measure_time(
            "loop_fct(fct, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = label
        obs.update(info)
        res.append(obs)

    for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 1024]):
        shape = shape_fct(dim)
        # Fewer arrays for the biggest dimensions.
        n_arrays = 10 if dim < 512 else 4
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [numpy.array(axes, dtype=numpy.int64)
              for _ in range(n_arrays)]
        info = dict(axes=axes, shape=shape)

        # numpy
        ctx = dict(
            xs=xs, ys=ys,
            fct=lambda x, y: numpy.amax(x, tuple(y)),
            loop_fct=loop_fct)
        record(ctx, dim, 'numpy', info)

        # onnxruntime
        ctx['fct'] = ort_fct
        record(ctx, dim, 'ort', info)

        if tf_reduce_max is not None:
            # tensorflow
            ctx['fct'] = tf_reduce_max
            ctx['xs'] = [convert_to_tensor(x) for x in xs]
            ctx['ys'] = ys
            record(ctx, dim, 'tf', info)

        if torch_max is not None:
            # torch
            ctx['fct'] = torch_reduce
            ctx['xs'] = [from_numpy(x) for x in xs]
            ctx['ys'] = ys
            record(ctx, dim, 'torch', info)

    # Dataframes
    # shape and dim hold the values of the last iteration.
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    # Keyword arguments: DataFrame.pivot no longer accepts positional ones
    # in pandas >= 2.0.
    piv = df.pivot(index='N', columns='fct', values='average')

    rs = piv.copy()
    for c in ['ort', 'torch', 'tf', 'tf_copy']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%r - %r"
                   " lower better" % (name, shape_name, axes))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%r - %r"
                  " higher better" % (name, shape_name, axes))
    # Guide lines at a speedup of 0.5 and 2.
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax


# Collects the DataFrame returned by each benchmark_op call below;
# the list is concatenated at the end of the script.
dfs = []

Reduction on a particular case KR#

Consecutive axes that are not reduced and consecutive reduced axes are merged. KR means kept axis - reduced axis.

(8, 24, 48, N), axis=(3, )#

axes = (3, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
# Keyword arguments: DataFrame.pivot is keyword-only in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
ReduceMax benchmark '(8, 24, 48, N)' - (3,) lower better, ReduceMax Speedup, baseline=numpy '(8, 24, 48, N)' - (3,) higher better

Out:

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:03,  3.00it/s]
 18%|#8        | 2/11 [00:00<00:02,  3.23it/s]
 27%|##7       | 3/11 [00:01<00:02,  2.89it/s]
 36%|###6      | 4/11 [00:01<00:02,  2.55it/s]
 45%|####5     | 5/11 [00:02<00:02,  2.18it/s]
 55%|#####4    | 6/11 [00:02<00:02,  1.88it/s]
 64%|######3   | 7/11 [00:03<00:02,  1.52it/s]
 73%|#######2  | 8/11 [00:04<00:02,  1.26it/s]
 82%|########1 | 9/11 [00:06<00:02,  1.03s/it]
 91%|######### | 10/11 [00:07<00:00,  1.04it/s]
100%|##########| 11/11 [00:08<00:00,  1.12s/it]
100%|##########| 11/11 [00:08<00:00,  1.28it/s]
N 8 16 32 64 100 128 200 256 400 512 1024
fct
numpy 0.013467 0.018564 0.020195 0.021290 0.022134 0.024180 0.028068 0.030897 0.038848 0.018271 0.032182
ort 0.000988 0.001202 0.003444 0.003155 0.004319 0.005091 0.006808 0.008247 0.012434 0.005917 0.010996
torch 0.016763 0.005454 0.007645 0.007190 0.008212 0.008922 0.011801 0.011574 0.014952 0.010284 0.013184


Reduction on a particular case RK#

Consecutive axes that are not reduced and consecutive reduced axes are merged. RK means reduced axis - kept axis.

(8, 24, 48, N), axis=(0, )#

axes = (0, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
# Keyword arguments: DataFrame.pivot is keyword-only in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
ReduceMax benchmark '(8, 24, 48, N)' - (0,) lower better, ReduceMax Speedup, baseline=numpy '(8, 24, 48, N)' - (0,) higher better

Out:

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:01,  8.94it/s]
 18%|#8        | 2/11 [00:00<00:01,  5.88it/s]
 27%|##7       | 3/11 [00:00<00:01,  4.10it/s]
 36%|###6      | 4/11 [00:01<00:02,  2.53it/s]
 45%|####5     | 5/11 [00:02<00:03,  1.73it/s]
 55%|#####4    | 6/11 [00:03<00:03,  1.29it/s]
 64%|######3   | 7/11 [00:05<00:04,  1.10s/it]
 73%|#######2  | 8/11 [00:07<00:04,  1.46s/it]
 82%|########1 | 9/11 [00:10<00:04,  2.06s/it]
 91%|######### | 10/11 [00:12<00:01,  1.97s/it]
100%|##########| 11/11 [00:16<00:00,  2.48s/it]
100%|##########| 11/11 [00:16<00:00,  1.47s/it]
N 8 16 32 64 100 128 200 256 400 512 1024
fct
numpy 0.003934 0.007633 0.014755 0.028724 0.045291 0.058588 0.089795 0.113312 0.176044 0.089834 0.180276
ort 0.001251 0.001934 0.003607 0.007129 0.010126 0.013284 0.020807 0.026195 0.035578 0.018388 0.036919
torch 0.003924 0.007449 0.007188 0.011850 0.012304 0.015142 0.021537 0.025224 0.034843 0.024082 0.054866


Reduction on a particular case KRK#

Consecutive axes that are not reduced and consecutive reduced axes are merged. KRK means kept axis - reduced axis - kept axis.

(8, 24, 48, N), axis=(1, 2)#

axes = (1, 2)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
# Keyword arguments: DataFrame.pivot is keyword-only in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
ReduceMax benchmark '(8, 24, 48, N)' - (1, 2) lower better, ReduceMax Speedup, baseline=numpy '(8, 24, 48, N)' - (1, 2) higher better

Out:

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:01,  5.76it/s]
 18%|#8        | 2/11 [00:00<00:01,  5.11it/s]
 27%|##7       | 3/11 [00:00<00:01,  4.07it/s]
 36%|###6      | 4/11 [00:01<00:02,  2.96it/s]
 45%|####5     | 5/11 [00:01<00:02,  2.14it/s]
 55%|#####4    | 6/11 [00:02<00:02,  1.68it/s]
 64%|######3   | 7/11 [00:03<00:03,  1.25it/s]
 73%|#######2  | 8/11 [00:05<00:03,  1.11s/it]
 82%|########1 | 9/11 [00:07<00:02,  1.47s/it]
 91%|######### | 10/11 [00:09<00:01,  1.46s/it]
100%|##########| 11/11 [00:12<00:00,  1.89s/it]
100%|##########| 11/11 [00:12<00:00,  1.11s/it]
N 8 16 32 64 100 128 200 256 400 512 1024
fct
numpy 0.008631 0.010105 0.014546 0.021651 0.029825 0.036066 0.052391 0.065007 0.097390 0.049180 0.096721
ort 0.000761 0.001081 0.001728 0.003235 0.005298 0.007585 0.008874 0.032894 0.015585 0.030937 0.064102
torch 0.005473 0.005885 0.006519 0.008134 0.011694 0.011500 0.015562 0.020452 0.025394 0.016208 0.032935


(8, 24 * 48, N), axis=1#

axes = (1, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24 * 48, dim))
dfs.append(df)
# Keyword arguments: DataFrame.pivot is keyword-only in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
ReduceMax benchmark '(8, 1152, N)' - (1,) lower better, ReduceMax Speedup, baseline=numpy '(8, 1152, N)' - (1,) higher better

Out:

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:01,  5.69it/s]
 18%|#8        | 2/11 [00:00<00:01,  5.14it/s]
 27%|##7       | 3/11 [00:00<00:02,  3.94it/s]
 36%|###6      | 4/11 [00:01<00:02,  2.90it/s]
 45%|####5     | 5/11 [00:01<00:02,  2.15it/s]
 55%|#####4    | 6/11 [00:02<00:03,  1.55it/s]
 64%|######3   | 7/11 [00:04<00:03,  1.18it/s]
 73%|#######2  | 8/11 [00:07<00:04,  1.62s/it]
 82%|########1 | 9/11 [00:09<00:03,  1.84s/it]
 91%|######### | 10/11 [00:12<00:02,  2.11s/it]
100%|##########| 11/11 [00:17<00:00,  3.15s/it]
100%|##########| 11/11 [00:17<00:00,  1.63s/it]
N 8 16 32 64 100 128 200 256 400 512 1024
fct
numpy 0.008571 0.010164 0.014432 0.021673 0.029800 0.036128 0.052375 0.065038 0.097465 0.049164 0.096720
ort 0.000760 0.001364 0.003334 0.003224 0.005316 0.007578 0.010529 0.031338 0.015733 0.029753 0.064109
torch 0.006133 0.005218 0.006995 0.008701 0.010335 0.027026 0.017265 0.174080 0.027989 0.148031 0.297641


(2, 8, 12, 24, 2, N), axis=(2, 3)#

axes = (2, 3)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (2, 8, 12, 24, 2, dim))
dfs.append(df)
# Keyword arguments: DataFrame.pivot is keyword-only in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
ReduceMax benchmark '(2, 8, 12, 24, 2, N)' - (2, 3) lower better, ReduceMax Speedup, baseline=numpy '(2, 8, 12, 24, 2, N)' - (2, 3) higher better

Out:

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:01,  6.86it/s]
 18%|#8        | 2/11 [00:00<00:01,  5.52it/s]
 27%|##7       | 3/11 [00:00<00:01,  4.40it/s]
 36%|###6      | 4/11 [00:01<00:02,  3.10it/s]
 45%|####5     | 5/11 [00:01<00:02,  2.20it/s]
 55%|#####4    | 6/11 [00:02<00:02,  1.72it/s]
 64%|######3   | 7/11 [00:03<00:03,  1.26it/s]
 73%|#######2  | 8/11 [00:05<00:03,  1.05s/it]
 82%|########1 | 9/11 [00:07<00:02,  1.45s/it]
 91%|######### | 10/11 [00:09<00:01,  1.44s/it]
100%|##########| 11/11 [00:12<00:00,  1.88s/it]
100%|##########| 11/11 [00:12<00:00,  1.10s/it]
N 8 16 32 64 100 128 200 256 400 512 1024
fct
numpy 0.005508 0.007596 0.012128 0.019728 0.028354 0.034948 0.052188 0.065615 0.099805 0.050743 0.100163
ort 0.000749 0.002956 0.001660 0.003051 0.004991 0.006224 0.008397 0.013757 0.017042 0.026608 0.065592
torch 0.006249 0.006000 0.006794 0.009261 0.012406 0.012853 0.016826 0.021993 0.026247 0.017668 0.030444


Reduction on a particular case RKR#

(N, 64, 16, 16), axis=(0, 2, 3)#

axes = (0, 2, 3)
df, piv, ax = benchmark_op(
    axes, shape_fct=lambda dim: (dim, 64, 16, 16))
dfs.append(df)
# Keyword arguments: DataFrame.pivot is keyword-only in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
ReduceMax benchmark '(N, 64, 16, 16)' - (0, 2, 3) lower better, ReduceMax Speedup, baseline=numpy '(N, 64, 16, 16)' - (0, 2, 3) higher better

Out:

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:01,  7.58it/s]
 18%|#8        | 2/11 [00:00<00:01,  5.39it/s]
 27%|##7       | 3/11 [00:00<00:02,  3.95it/s]
 36%|###6      | 4/11 [00:01<00:02,  2.65it/s]
 45%|####5     | 5/11 [00:02<00:03,  1.83it/s]
 55%|#####4    | 6/11 [00:03<00:03,  1.40it/s]
 64%|######3   | 7/11 [00:04<00:03,  1.01it/s]
 73%|#######2  | 8/11 [00:06<00:03,  1.32s/it]
 82%|########1 | 9/11 [00:10<00:03,  1.93s/it]
 91%|######### | 10/11 [00:11<00:01,  1.87s/it]
100%|##########| 11/11 [00:15<00:00,  2.34s/it]
100%|##########| 11/11 [00:15<00:00,  1.38s/it]
N 8 16 32 64 100 128 200 256 400 512 1024
fct
numpy 0.001918 0.004126 0.006687 0.011854 0.017550 0.022094 0.032962 0.042542 0.064917 0.033292 0.065487
ort 0.001115 0.002432 0.004092 0.006675 0.009393 0.011296 0.018810 0.022801 0.035848 0.019729 0.036799
torch 0.006617 0.008903 0.009335 0.012233 0.017036 0.018798 0.025952 0.034662 0.067723 0.038821 0.073612


Reduction on a particular case RKRK#

(8, 24, 48, N), axis=(0, 2)#

axes = (0, 2)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
# Keyword arguments: DataFrame.pivot is keyword-only in pandas >= 2.0.
df.pivot(index="fct", columns="N", values="average")
ReduceMax benchmark '(8, 24, 48, N)' - (0, 2) lower better, ReduceMax Speedup, baseline=numpy '(8, 24, 48, N)' - (0, 2) higher better

Out:

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:01,  5.77it/s]
 18%|#8        | 2/11 [00:00<00:01,  4.92it/s]
 27%|##7       | 3/11 [00:00<00:02,  3.74it/s]
 36%|###6      | 4/11 [00:01<00:02,  2.71it/s]
 45%|####5     | 5/11 [00:02<00:03,  1.96it/s]
 55%|#####4    | 6/11 [00:02<00:03,  1.54it/s]
 64%|######3   | 7/11 [00:04<00:03,  1.13it/s]
 73%|#######2  | 8/11 [00:06<00:03,  1.16s/it]
 82%|########1 | 9/11 [00:08<00:03,  1.60s/it]
 91%|######### | 10/11 [00:10<00:01,  1.59s/it]
100%|##########| 11/11 [00:15<00:00,  2.62s/it]
100%|##########| 11/11 [00:15<00:00,  1.38s/it]
N 8 16 32 64 100 128 200 256 400 512 1024
fct
numpy 0.007947 0.009897 0.014362 0.021815 0.030187 0.036589 0.053978 0.066894 0.100828 0.050622 0.099654
ort 0.001132 0.002010 0.005740 0.006788 0.010836 0.013495 0.022425 0.030080 0.041677 0.044605 0.272253
torch 0.006169 0.006491 0.006500 0.008999 0.012330 0.012550 0.015340 0.019716 0.023662 0.015408 0.033653


Conclusion#

Some of the configurations should be investigated (see l-reducesum-problem1). The reduction on tensorflow in one dimension seems to be lazy.

# Gathers every benchmark into one table, then writes it as CSV and Excel
# next to a PNG of the current figure.
name = "reducemax"
merged = pandas.concat(dfs)
for export, pattern in ((merged.to_csv, "plot_%s.csv"),
                        (merged.to_excel, "plot_%s.xlsx")):
    export(pattern % name, index=False)
plt.savefig("plot_%s.png" % name)

plt.show()
plot op reducemax

Total running time of the script: ( 1 minutes 50.637 seconds)

Gallery generated by Sphinx-Gallery