Note

Click here to download the full example code

Compares implementations of ReduceSum#

This example compares numpy.sum from numpy with the onnxruntime implementation of ReduceSum. If available, tensorflow and pytorch are included as well.

Available optimisation #

The code shows which parallelisation optimisation can be used (AVX or SSE) and the number of available processors.

import numpy
import pandas
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxReduceSumApi11
from cpyquickhelper.numbers import measure_time
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import (
    code_optimisation, custom_reducesum_rk_float)
# Prints the value returned by ``code_optimisation()`` from the experimental
# C extension imported above.
print(code_optimisation())

AVX-omp=8

ReduceSum implementations #

# Optional backends: tensorflow and torch are only benchmarked when they can
# be imported. ``None`` is used as a sentinel for an unavailable backend.
try:
    from tensorflow.math import reduce_sum as tf_reduce_sum
    from tensorflow import convert_to_tensor
except ImportError:
    # ``convert_to_tensor`` may stay undefined here; it is only called when
    # ``tf_reduce_sum`` is not None.
    tf_reduce_sum = None
try:
    from torch import sum as torch_sum, from_numpy
except ImportError:
    # ``from_numpy`` stays undefined here; it is only called when
    # ``torch_sum`` is not None.
    torch_sum = None


def build_ort_reducesum(axes, op_version=14):  # opset=13, 14, ...
    """
    Creates a function running a single ReduceSum node with onnxruntime.

    :param axes: axes given to ``OnnxReduceSumApi11``
    :param op_version: used both as the node ``op_version`` and as the
        ``target_opset`` of the converted model
    :return: function ``f(x, y)`` returning ``sess.run(None, {'x': x})``;
        *y* is accepted so that every benchmarked function shares the same
        signature, but it is not used
    """
    graph = OnnxReduceSumApi11(
        'x', axes=axes, output_names=['z'], op_version=op_version)
    model = graph.to_onnx(
        target_opset=op_version, inputs=[('x', FloatTensorType())])
    sess = InferenceSession(model.SerializeToString())

    def run(x, y):
        # The axes are already stored in the graph, *y* is ignored.
        return sess.run(None, {'x': x})

    return run


def loop_fct(fct, xs, ys):
    """
    Calls ``fct(x, y)`` for every pair taken from *xs* and *ys* in lockstep.

    Iteration stops at the end of the shorter sequence and the values
    returned by *fct* are discarded.

    :param fct: callable taking two positional arguments
    :param xs: iterable of first arguments
    :param ys: iterable of second arguments
    """
    for pair in zip(xs, ys):
        fct(*pair)


def benchmark_op(axes, repeat=5, number=5, name="ReduceSum", shape_fct=None,
                 custom_impl=False):
    """
    Measures the time spent by several implementations of ReduceSum
    over *axes* for growing input sizes and plots the results.

    :param axes: axes to reduce (a tuple of integers)
    :param repeat: parameter *repeat* forwarded to ``measure_time``
    :param number: parameter *number* forwarded to ``measure_time``
    :param name: operator name used in the plot titles
    :param shape_fct: function ``dim -> shape`` returning the shape of the
        random inputs, ``(3, dim, 1, 128, 64)`` is used if None
    :param custom_impl: if True, ``custom_reducesum_rk_float`` is measured
        as well on inputs reshaped into two dimensions, this is only
        allowed for ``axes == (0, )``
    :return: tuple ``(df, rs, ax)``: *df* holds one row per measure,
        *rs* holds for every size the ratio *numpy time / time* of each
        implementation (numpy is 1), *ax* is the array of matplotlib axes
    :raises RuntimeError: if *custom_impl* is True and axes is not ``(0, )``
    """
    if shape_fct is None:
        def shape_fct(dim):
            return (3, dim, 1, 128, 64)

    np_axes = tuple(int(a) for a in axes)
    if custom_impl and np_axes != (0, ):
        # Fails before any time is spent on measuring.
        raise RuntimeError(
            f"Unexpected axes={axes!r}.")

    ort_fct = build_ort_reducesum(axes)
    res = []

    def record(label, fct, xs, ys, dim, info):
        # Times fct over all pairs (x, y) and stores the observation.
        ctx = dict(xs=xs, ys=ys, fct=fct, loop_fct=loop_fct)
        obs = measure_time(
            "loop_fct(fct, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = label
        obs.update(info)
        res.append(obs)

    def numpy_sum(x, y):
        # The axes must be given through *axis*: numpy.sum(x, *y) hands a
        # second axis over as *dtype* and only reduces the first one.
        return numpy.sum(x, axis=np_axes)

    def custom_sum(x, y):
        return custom_reducesum_rk_float(x)

    def torch_sum1(x, y):
        return torch_sum(x, y[0])

    def torch_sum2(x, y):
        # Reduces y[1] first so that index y[0] still designates
        # the same axis afterwards (only two axes are handled).
        return torch_sum(torch_sum(x, y[1]), y[0])

    for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 1024]):
        shape = shape_fct(dim)
        # fewer arrays for the biggest sizes
        n_arrays = 10 if dim < 512 else 4
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [numpy.array(axes, dtype=numpy.int64)
              for _ in range(n_arrays)]
        info = dict(axes=axes, shape=shape)

        # numpy
        record('numpy', numpy_sum, xs, ys, dim, info)

        # onnxruntime
        record('ort', ort_fct, xs, ys, dim, info)

        if custom_impl:
            # the custom implementation receives 2D contiguous copies
            xs2d = [x.reshape((x.shape[0], -1)).copy() for x in xs]
            record('custom', custom_sum, xs2d, ys, dim, info)

        if tf_reduce_sum is not None:
            # tensorflow
            record('tf', tf_reduce_sum,
                   [convert_to_tensor(x) for x in xs], ys, dim, info)

        if torch_sum is not None:
            # torch
            record('torch', torch_sum1 if len(axes) == 1 else torch_sum2,
                   [from_numpy(x) for x in xs], ys, dim, info)

    # Dataframes
    # shape and dim hold the values of the last iteration.
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    # Keyword arguments: the recorded run warns that positional arguments
    # of DataFrame.pivot become keyword-only.
    piv = df.pivot(index='N', columns='fct', values='average')

    # Speedup against numpy for every measured implementation,
    # including 'custom' when it is present.
    rs = piv.copy()
    for c in rs.columns:
        if c != 'numpy':
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title=f"{name} benchmark\n{shape_name!r} - {axes!r} lower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title=f"{name} Speedup, baseline=numpy\n{shape_name!r} - {axes!r}"
                  " higher better")
    # horizontal guides at speedups 0.5 and 2
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax


# Collects the dataframe of raw measures produced by every benchmark below.
dfs = []

Reduction on a particular case KR #

Consecutive axes that are not reduced and consecutive reduced axes are merged. KR means kept axis - reduced axis.

(8, 24, 48, N), axis=(3, )#

# KR case: only the last axis is reduced.
axes = (3, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot are deprecated.
df.pivot(index="fct", columns="N", values="average")

ReduceSum benchmark '(8, 24, 48, N)' - (3,) lower better, ReduceSum Speedup, baseline=numpy '(8, 24, 48, N)' - (3,) higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:01<00:13,  1.37s/it]
 18%|#8        | 2/11 [00:03<00:13,  1.55s/it]
 27%|##7       | 3/11 [00:04<00:13,  1.66s/it]
 36%|###6      | 4/11 [00:06<00:11,  1.68s/it]
 45%|####5     | 5/11 [00:08<00:10,  1.71s/it]
 55%|#####4    | 6/11 [00:10<00:09,  1.97s/it]
 64%|######3   | 7/11 [00:13<00:09,  2.28s/it]
 73%|#######2  | 8/11 [00:17<00:07,  2.62s/it]
 82%|########1 | 9/11 [00:21<00:06,  3.04s/it]
 91%|######### | 10/11 [00:22<00:02,  2.66s/it]
100%|##########| 11/11 [00:25<00:00,  2.77s/it]
100%|##########| 11/11 [00:25<00:00,  2.35s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:151: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:189: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.004108	0.004851	0.006410	0.007902	0.010157	0.011906	0.017698	0.021819	0.031517	0.015750	0.031232
ort	0.001496	0.001562	0.002094	0.003191	0.004674	0.005394	0.007278	0.009411	0.011706	0.006069	0.011118
torch	0.048272	0.059100	0.060178	0.051081	0.046577	0.069638	0.073717	0.079963	0.079712	0.031400	0.041889

Reduction on a particular case RK #

Consecutive axes that are not reduced and consecutive reduced axes are merged. RK means reduced axis - kept axis.

(8, 24, 48, N), axis=(0, )#

# RK case: only the first axis is reduced, the custom implementation
# is measured as well.
axes = (0, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim),
                           custom_impl=True)
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot are deprecated.
df.pivot(index="fct", columns="N", values="average")

ReduceSum benchmark '(8, 24, 48, N)' - (0,) lower better, ReduceSum Speedup, baseline=numpy '(8, 24, 48, N)' - (0,) higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:01<00:19,  2.00s/it]
 18%|#8        | 2/11 [00:03<00:15,  1.77s/it]
 27%|##7       | 3/11 [00:06<00:19,  2.45s/it]
 36%|###6      | 4/11 [00:11<00:23,  3.35s/it]
 45%|####5     | 5/11 [00:17<00:24,  4.11s/it]
 55%|#####4    | 6/11 [00:23<00:24,  4.86s/it]
 64%|######3   | 7/11 [00:29<00:20,  5.13s/it]
 73%|#######2  | 8/11 [00:36<00:17,  5.71s/it]
 82%|########1 | 9/11 [00:44<00:13,  6.51s/it]
 91%|######### | 10/11 [00:48<00:05,  5.68s/it]
100%|##########| 11/11 [00:53<00:00,  5.67s/it]
100%|##########| 11/11 [00:53<00:00,  4.89s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:151: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:206: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
custom	0.025423	0.025525	0.060544	0.054723	0.044040	0.064949	0.062435	0.066504	0.069788	0.029936	0.036309
numpy	0.001122	0.002217	0.003698	0.006776	0.010306	0.017296	0.027376	0.035357	0.054845	0.030619	0.054749
ort	0.001106	0.001787	0.003138	0.006356	0.009423	0.012132	0.016790	0.020318	0.030337	0.016819	0.030457
torch	0.051417	0.032783	0.060105	0.114873	0.144508	0.145739	0.102461	0.132251	0.137654	0.055817	0.066265

Reduction on a particular case KRK #

Consecutive axes that are not reduced and consecutive reduced axes are merged. KRK means kept axis - reduced axis - kept axis.

(8, 24, 48, N), axis=(1, 2)#

# KRK case: the two middle axes are reduced.
axes = (1, 2)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot are deprecated.
df.pivot(index="fct", columns="N", values="average")

ReduceSum benchmark '(8, 24, 48, N)' - (1, 2) lower better, ReduceSum Speedup, baseline=numpy '(8, 24, 48, N)' - (1, 2) higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:07,  1.36it/s]
 18%|#8        | 2/11 [00:02<00:11,  1.32s/it]
 27%|##7       | 3/11 [00:04<00:13,  1.69s/it]
 36%|###6      | 4/11 [00:06<00:13,  1.90s/it]
 45%|####5     | 5/11 [00:09<00:12,  2.10s/it]
 55%|#####4    | 6/11 [00:12<00:12,  2.41s/it]
 64%|######3   | 7/11 [00:18<00:14,  3.60s/it]
 73%|#######2  | 8/11 [00:25<00:14,  4.80s/it]
 82%|########1 | 9/11 [00:35<00:12,  6.24s/it]
 91%|######### | 10/11 [00:39<00:05,  5.73s/it]
100%|##########| 11/11 [00:47<00:00,  6.32s/it]
100%|##########| 11/11 [00:47<00:00,  4.31s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:151: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:222: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.002650	0.004995	0.009114	0.018171	0.028635	0.035755	0.056285	0.071005	0.110573	0.056279	0.128533
ort	0.005669	0.001289	0.002282	0.004219	0.006878	0.008088	0.012652	0.015264	0.024189	0.012552	0.027233
torch	0.020311	0.061180	0.070951	0.060121	0.053398	0.065635	0.154997	0.184678	0.205907	0.095580	0.114066

(8, 24 * 48, N), axis=1 #

# KRK case with a single reduced axis of size 24 * 48.
axes = (1, )
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24 * 48, dim))
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot are deprecated.
df.pivot(index="fct", columns="N", values="average")

ReduceSum benchmark '(8, 1152, N)' - (1,) lower better, ReduceSum Speedup, baseline=numpy '(8, 1152, N)' - (1,) higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:09,  1.04it/s]
 18%|#8        | 2/11 [00:01<00:05,  1.52it/s]
 27%|##7       | 3/11 [00:03<00:10,  1.30s/it]
 36%|###6      | 4/11 [00:04<00:09,  1.39s/it]
 45%|####5     | 5/11 [00:07<00:10,  1.68s/it]
 55%|#####4    | 6/11 [00:09<00:09,  1.98s/it]
 64%|######3   | 7/11 [00:12<00:09,  2.33s/it]
 73%|#######2  | 8/11 [00:16<00:08,  2.75s/it]
 82%|########1 | 9/11 [00:20<00:06,  3.30s/it]
 91%|######### | 10/11 [00:23<00:03,  3.01s/it]
100%|##########| 11/11 [00:27<00:00,  3.33s/it]
100%|##########| 11/11 [00:27<00:00,  2.49s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:151: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:231: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.004254	0.004919	0.006705	0.009982	0.011278	0.013374	0.017590	0.021376	0.031766	0.015937	0.030009
ort	0.005970	0.001283	0.002337	0.004279	0.006523	0.008211	0.012584	0.015075	0.023856	0.012413	0.027142
torch	0.027301	0.009960	0.070464	0.040462	0.060979	0.068766	0.073370	0.086237	0.089010	0.047131	0.068266

(2, 8, 12, 24, 2, N), axis=(2, 3)#

# KRK case on a tensor with six dimensions, axes 2 and 3 are reduced.
axes = (2, 3)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (2, 8, 12, 24, 2, dim))
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot are deprecated.
df.pivot(index="fct", columns="N", values="average")

ReduceSum benchmark '(2, 8, 12, 24, 2, N)' - (2, 3) lower better, ReduceSum Speedup, baseline=numpy '(2, 8, 12, 24, 2, N)' - (2, 3) higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:01<00:10,  1.04s/it]
 18%|#8        | 2/11 [00:01<00:08,  1.11it/s]
 27%|##7       | 3/11 [00:03<00:11,  1.43s/it]
 36%|###6      | 4/11 [00:06<00:13,  1.88s/it]
 45%|####5     | 5/11 [00:13<00:21,  3.56s/it]
 55%|#####4    | 6/11 [00:19<00:23,  4.68s/it]
 64%|######3   | 7/11 [00:27<00:22,  5.68s/it]
 73%|#######2  | 8/11 [00:36<00:19,  6.55s/it]
 82%|########1 | 9/11 [00:45<00:14,  7.49s/it]
 91%|######### | 10/11 [00:49<00:06,  6.52s/it]
100%|##########| 11/11 [00:57<00:00,  6.86s/it]
100%|##########| 11/11 [00:57<00:00,  5.23s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:151: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:240: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.002786	0.005112	0.009414	0.019250	0.029668	0.036974	0.057420	0.073561	0.113452	0.058290	0.124900
ort	0.001672	0.001371	0.002411	0.004568	0.007558	0.008682	0.012172	0.015954	0.022385	0.014245	0.036048
torch	0.036366	0.023836	0.067237	0.073253	0.214941	0.216737	0.222536	0.223638	0.210876	0.082061	0.107904

Reduction on a particular case RKRK #

(8, 24, 48, N), axis=(0, 2)#

# RKRK case: reduced axes 0 and 2 alternate with kept axes 1 and 3.
axes = (0, 2)
df, piv, ax = benchmark_op(axes, shape_fct=lambda dim: (8, 24, 48, dim))
dfs.append(df)
# Keyword arguments: positional arguments of DataFrame.pivot are deprecated.
df.pivot(index="fct", columns="N", values="average")

ReduceSum benchmark '(8, 24, 48, N)' - (0, 2) lower better, ReduceSum Speedup, baseline=numpy '(8, 24, 48, N)' - (0, 2) higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:01<00:10,  1.05s/it]
 18%|#8        | 2/11 [00:02<00:13,  1.48s/it]
 27%|##7       | 3/11 [00:05<00:14,  1.80s/it]
 36%|###6      | 4/11 [00:07<00:15,  2.17s/it]
 45%|####5     | 5/11 [00:10<00:14,  2.46s/it]
 55%|#####4    | 6/11 [00:14<00:14,  2.86s/it]
 64%|######3   | 7/11 [00:22<00:17,  4.46s/it]
 73%|#######2  | 8/11 [00:29<00:16,  5.51s/it]
 82%|########1 | 9/11 [00:40<00:14,  7.17s/it]
 91%|######### | 10/11 [00:46<00:06,  6.66s/it]
100%|##########| 11/11 [00:59<00:00,  8.66s/it]
100%|##########| 11/11 [00:59<00:00,  5.40s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:151: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_reducesum.py:252: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.002737	0.005142	0.009669	0.021690	0.034115	0.043409	0.068213	0.083322	0.133460	0.067885	0.134256
ort	0.001075	0.001807	0.003684	0.006777	0.008837	0.013446	0.018525	0.025275	0.036985	0.043171	0.246788
torch	0.037370	0.062487	0.070774	0.074768	0.067579	0.076031	0.205576	0.178567	0.225860	0.091197	0.110022

Conclusion #

Some of the configurations should be investigated (see :ref:`l-reducesum-problem1`). The reduction on tensorflow in one dimension seems to be lazy.

# Gathers the raw measures of every benchmark into a single table and
# stores it as CSV and Excel files next to the script.
merged = pandas.concat(dfs)
name = "reducesum"
merged.to_csv(f"plot_{name}.csv", index=False)
merged.to_excel(f"plot_{name}.xlsx", index=False)
# NOTE(review): savefig is called on pyplot and not on a given figure,
# presumably only the current (most recent) figure is saved - confirm.
plt.savefig(f"plot_{name}.png")

plt.show()

Total running time of the script: ( 4 minutes 43.395 seconds)

Gallery generated by Sphinx-Gallery