Compares implementations of Transpose#

This example compares numpy.transpose to the onnxruntime implementation of Transpose. If available, tensorflow and pytorch are included as well.

Available optimisation #

The code shows which parallelisation optimisation could be used, AVX or SSE, and the number of available processors. Both numpy and torch have lazy implementations: the function switches dimensions and strides but does not move any data. That’s why the result is made contiguous in both cases (numpy.ascontiguousarray for numpy, Tensor.contiguous for torch).

import numpy
import pandas
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxTranspose
from cpyquickhelper.numbers import measure_time
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
# Prints the optimisation settings reported by mlprodict's experimental C code.
print(code_optimisation())

AVX-omp=8

Transpose implementations #

Function transpose from tensorflow is called directly. For pytorch, function einsum is used instead of transpose: the equation reflects the required transposition.

# tensorflow is optional: tf_transpose stays None when it cannot be imported
# and benchmark_op then skips the tensorflow measurements.
try:
    from tensorflow import transpose as tf_transpose, convert_to_tensor
except ImportError:
    tf_transpose = None
# pytorch is optional as well: torch_einsum stays None when it is missing
# and benchmark_op then skips the torch measurements.
try:
    from torch import einsum as torch_einsum, from_numpy
except ImportError:
    torch_einsum = None


def build_ort_transpose(perm, op_version=12):
    """
    Builds an ONNX graph made of a single Transpose node and wraps it
    into a function running it with onnxruntime.

    :param perm: permutation stored in the Transpose node
    :param op_version: opset used both for the node and the conversion
    :return: function `f(x, y)` which returns the outputs of
        `InferenceSession.run` for input `x`; `y` is ignored, it only
        exists to share the signature of the other benchmarked functions
    """
    transpose_node = OnnxTranspose(
        'x', perm=perm, op_version=op_version, output_names=['z'])
    model = transpose_node.to_onnx(
        inputs=[('x', FloatTensorType())], target_opset=op_version)
    session = InferenceSession(model.SerializeToString())

    def run(x, y):
        # The permutation is already part of the graph, y is not needed.
        return session.run(None, {'x': x})

    return run


def loop_fct(fct, xs, ys):
    """
    Calls `fct` on every pair taken from `xs` and `ys`.

    :param fct: function taking two positional arguments
    :param xs: first arguments
    :param ys: second arguments, paired with `xs` by `zip`
        (the iteration stops with the shortest sequence)
    :return: None, the results of `fct` are discarded
    """
    for args in zip(xs, ys):
        fct(*args)


def perm2eq(perm):
    """
    Converts a permutation into the einsum equation doing the same
    transposition, e.g. `(1, 0, 2, 3)` becomes `'abcd->bacd'`.

    :param perm: permutation of the axes
    :return: equation as a string
    """
    letters = [chr(ord("a") + k) for k in range(len(perm))]
    source = "".join(letters)
    target = "".join(letters[axis] for axis in perm)
    return source + "->" + target


def benchmark_op(perm, repeat=5, number=5, name="Transpose", shape_fct=None):
    """
    Benchmarks a transposition with numpy, onnxruntime and, when they
    could be imported, tensorflow and pytorch, then plots the results.

    :param perm: permutation of the axes applied by every implementation
    :param repeat: parameter `repeat` forwarded to `measure_time`
    :param number: parameter `number` forwarded to `measure_time`
    :param name: operator name displayed in the graph titles
    :param shape_fct: function returning the shape of the random inputs
        for a dimension `dim`, defaults to `(3, dim, 1, 512)`
    :return: tuple `(df, rs, ax)`: `df` contains the raw measures,
        `rs` the speedups relative to numpy (rows `N`, columns `fct`),
        `ax` the two matplotlib axes holding the graphs
    """
    if shape_fct is None:
        def shape_fct(dim):
            return (3, dim, 1, 512)
    ort_fct = build_ort_transpose(perm)
    # The einsum equation only depends on perm, it is computed once.
    equation = perm2eq(perm)
    res = []

    def record(label, fct, xs, ys, dim, shape):
        # Measures one implementation on all inputs and stores the result.
        ctx = dict(xs=xs, ys=ys, fct=fct, loop_fct=loop_fct)
        obs = measure_time(
            "loop_fct(fct, xs, ys)",
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = label
        obs.update(perm=perm, shape=shape)
        res.append(obs)

    for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 1024]):
        shape = shape_fct(dim)
        # Fewer inputs for the biggest dimensions
        # (presumably to limit the running time).
        n_arrays = 10 if dim < 512 else 4
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        ys = [perm for _ in range(n_arrays)]

        # numpy: transpose only returns a view,
        # ascontiguousarray forces the data to be moved.
        record('numpy',
               lambda x, y: numpy.ascontiguousarray(numpy.transpose(x, y)),
               xs, ys, dim, shape)

        # onnxruntime
        record('ort', ort_fct, xs, ys, dim, shape)

        if tf_transpose is not None:
            # tensorflow, inputs are converted before the measure
            record('tf', tf_transpose,
                   [convert_to_tensor(x) for x in xs],
                   [convert_to_tensor(y) for y in ys],
                   dim, shape)

            # tensorflow with copy: the conversions from and to numpy are
            # measured too. perm must be given explicitly, otherwise
            # tf.transpose reverses all axes.
            record('tf_copy',
                   lambda x, y: tf_transpose(
                       convert_to_tensor(x), y).numpy(),
                   xs, ys, dim, shape)

        if torch_einsum is not None:
            # torch: einsum is lazy, contiguous forces the data to be moved.
            record('torch',
                   lambda x, y: torch_einsum(equation, x).contiguous(),
                   [from_numpy(x) for x in xs], ys, dim, shape)

    # Dataframes
    # shape and dim hold the values of the last iteration.
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    df.columns = [_.replace('dim', 'N') for _ in df.columns]
    # Keyword arguments: positional ones are deprecated in DataFrame.pivot.
    piv = df.pivot(index='N', columns='fct', values='average')

    # Speedups: ratio between numpy's time and every other implementation.
    rs = piv.copy()
    for c in ['ort', 'torch', 'tf', 'tf_copy']:
        if c in rs.columns:
            rs[c] = rs['numpy'] / rs[c]
    rs['numpy'] = 1.

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%r - %r - %s"
                   " lower better" % (name, shape_name, perm, equation))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%r - %r - %s"
                  " higher better" % (name, shape_name, perm, equation))
    # Horizontal guides at speedups 0.5 and 2.
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return df, rs, ax


# Accumulates the dataframe returned by every call to benchmark_op.
dfs = []

First permutation: (1, 0, 2, 3)#

perm = (1, 0, 2, 3)
df, piv, ax = benchmark_op(perm)
dfs.append(df)
# Keyword arguments: positional ones are deprecated in DataFrame.pivot.
df.pivot(index="fct", columns="N", values="average")

Transpose benchmark '(3, N, 1, 512)' - (1, 0, 2, 3) - abcd->bacd lower better, Transpose Speedup, baseline=numpy '(3, N, 1, 512)' - (1, 0, 2, 3) - abcd->bacd higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:05,  1.99it/s]
 27%|##7       | 3/11 [00:01<00:03,  2.50it/s]
 36%|###6      | 4/11 [00:02<00:05,  1.20it/s]
 45%|####5     | 5/11 [00:04<00:06,  1.03s/it]
 55%|#####4    | 6/11 [00:05<00:04,  1.05it/s]
 64%|######3   | 7/11 [00:06<00:04,  1.25s/it]
 73%|#######2  | 8/11 [00:09<00:04,  1.57s/it]
 82%|########1 | 9/11 [00:11<00:03,  1.89s/it]
 91%|######### | 10/11 [00:12<00:01,  1.65s/it]
100%|##########| 11/11 [00:14<00:00,  1.63s/it]
100%|##########| 11/11 [00:14<00:00,  1.32s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:185: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.000270	0.000427	0.000752	0.001646	0.002840	0.003820	0.005900	0.007484	0.011398	0.004826	0.009602
ort	0.000654	0.000777	0.001351	0.003674	0.005989	0.007006	0.010284	0.012907	0.018684	0.009353	0.017637
torch	0.018918	0.000820	0.023991	0.059321	0.046368	0.018995	0.056055	0.067662	0.068277	0.026017	0.030346

Second permutation: (1, 0, 3, 2)#

perm = (1, 0, 3, 2)
df, piv, ax = benchmark_op(perm)
dfs.append(df)
# Keyword arguments: positional ones are deprecated in DataFrame.pivot.
df.pivot(index="fct", columns="N", values="average")

Transpose benchmark '(3, N, 1, 512)' - (1, 0, 3, 2) - abcd->badc lower better, Transpose Speedup, baseline=numpy '(3, N, 1, 512)' - (1, 0, 3, 2) - abcd->badc higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:01,  9.51it/s]
 18%|#8        | 2/11 [00:00<00:01,  6.69it/s]
 27%|##7       | 3/11 [00:01<00:04,  1.67it/s]
 36%|###6      | 4/11 [00:03<00:07,  1.02s/it]
 45%|####5     | 5/11 [00:05<00:08,  1.40s/it]
 55%|#####4    | 6/11 [00:07<00:09,  1.80s/it]
 64%|######3   | 7/11 [00:11<00:09,  2.42s/it]
 73%|#######2  | 8/11 [00:15<00:09,  3.00s/it]
 82%|########1 | 9/11 [00:20<00:07,  3.70s/it]
 91%|######### | 10/11 [00:23<00:03,  3.33s/it]
100%|##########| 11/11 [00:27<00:00,  3.70s/it]
100%|##########| 11/11 [00:27<00:00,  2.54s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:194: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.000253	0.000419	0.000744	0.001626	0.003144	0.004036	0.005874	0.006452	0.010431	0.005021	0.010341
ort	0.003036	0.005623	0.011075	0.022903	0.035898	0.044989	0.068597	0.087956	0.135886	0.069268	0.137805
torch	0.000692	0.000800	0.033032	0.041222	0.042228	0.052062	0.070198	0.071510	0.057273	0.022277	0.027157

Third permutation: (0, 2, 1, 3)#

This transposition is equivalent to a reshape because it only moves the axis of size 1. The comparison is not entirely fair as the cost for onnxruntime includes a copy from numpy to onnxruntime, a reshape = another copy, then a copy back to numpy. Tensorflow and pytorch seem to have a lazy implementation in this case.

perm = (0, 2, 1, 3)
df, piv, ax = benchmark_op(perm)
dfs.append(df)
# Keyword arguments: positional ones are deprecated in DataFrame.pivot.
df.pivot(index="fct", columns="N", values="average")

Transpose benchmark '(3, N, 1, 512)' - (0, 2, 1, 3) - abcd->acbd lower better, Transpose Speedup, baseline=numpy '(3, N, 1, 512)' - (0, 2, 1, 3) - abcd->acbd higher better

  0%|          | 0/11 [00:00<?, ?it/s]
 27%|##7       | 3/11 [00:00<00:00, 24.24it/s]
 55%|#####4    | 6/11 [00:00<00:00,  7.92it/s]
 73%|#######2  | 8/11 [00:01<00:00,  5.14it/s]
 82%|########1 | 9/11 [00:01<00:00,  3.85it/s]
 91%|######### | 10/11 [00:02<00:00,  3.86it/s]
100%|##########| 11/11 [00:02<00:00,  3.18it/s]
100%|##########| 11/11 [00:02<00:00,  4.28it/s]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:211: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.000083	0.000080	0.000082	0.000082	0.000081	0.000079	0.000081	0.000080	0.000081	0.000035	0.000034
ort	0.000542	0.000677	0.001170	0.003483	0.005882	0.006710	0.007881	0.009400	0.013910	0.006885	0.013150
torch	0.000395	0.000395	0.000394	0.000395	0.000396	0.000395	0.000394	0.000396	0.000395	0.000162	0.000162

Fourth permutation: (3, 1, 2, 0)#

perm = (3, 1, 2, 0)
df, piv, ax = benchmark_op(perm)
dfs.append(df)
# Keyword arguments: positional ones are deprecated in DataFrame.pivot.
df.pivot(index="fct", columns="N", values="average")

Transpose benchmark '(3, N, 1, 512)' - (3, 1, 2, 0) - abcd->dbca lower better, Transpose Speedup, baseline=numpy '(3, N, 1, 512)' - (3, 1, 2, 0) - abcd->dbca higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:00<00:01,  5.48it/s]
 18%|#8        | 2/11 [00:00<00:02,  3.77it/s]
 27%|##7       | 3/11 [00:02<00:07,  1.03it/s]
 36%|###6      | 4/11 [00:04<00:10,  1.44s/it]
 45%|####5     | 5/11 [00:07<00:12,  2.07s/it]
 55%|#####4    | 6/11 [00:11<00:12,  2.58s/it]
 64%|######3   | 7/11 [00:26<00:26,  6.72s/it]
 73%|#######2  | 8/11 [00:46<00:32, 10.95s/it]
 82%|########1 | 9/11 [01:16<00:33, 16.80s/it]
 91%|######### | 10/11 [01:32<00:16, 16.74s/it]
100%|##########| 11/11 [02:03<00:00, 21.15s/it]
100%|##########| 11/11 [02:03<00:00, 11.26s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:220: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.001104	0.002178	0.004373	0.008796	0.015089	0.019270	0.135477	0.197293	0.286950	0.201747	0.358163
ort	0.004385	0.008273	0.016386	0.033634	0.054205	0.067792	0.379390	0.491831	0.774584	0.395651	0.788894
torch	0.001583	0.002092	0.051394	0.042398	0.057071	0.053089	0.092216	0.107220	0.119269	0.063229	0.092599

Fifth permutation: (1, 2, 3, 0)#

perm = (1, 2, 3, 0)
df, piv, ax = benchmark_op(perm)
dfs.append(df)
# Keyword arguments: positional ones are deprecated in DataFrame.pivot.
df.pivot(index="fct", columns="N", values="average")

Transpose benchmark '(3, N, 1, 512)' - (1, 2, 3, 0) - abcd->bcda lower better, Transpose Speedup, baseline=numpy '(3, N, 1, 512)' - (1, 2, 3, 0) - abcd->bcda higher better

  0%|          | 0/11 [00:00<?, ?it/s]
 18%|#8        | 2/11 [00:00<00:00, 11.01it/s]
 36%|###6      | 4/11 [00:03<00:06,  1.13it/s]
 45%|####5     | 5/11 [00:04<00:05,  1.01it/s]
 55%|#####4    | 6/11 [00:06<00:06,  1.24s/it]
 64%|######3   | 7/11 [00:08<00:06,  1.55s/it]
 73%|#######2  | 8/11 [00:11<00:05,  1.87s/it]
 82%|########1 | 9/11 [00:13<00:04,  2.17s/it]
 91%|######### | 10/11 [00:15<00:01,  1.96s/it]
100%|##########| 11/11 [00:17<00:00,  2.04s/it]
100%|##########| 11/11 [00:17<00:00,  1.60s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:229: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.001064	0.001642	0.003144	0.006230	0.009963	0.012573	0.019217	0.024321	0.037557	0.019125	0.038080
ort	0.000673	0.000895	0.001538	0.003983	0.006495	0.007289	0.010115	0.013041	0.018295	0.009686	0.018245
torch	0.001003	0.001404	0.064502	0.034310	0.031539	0.050916	0.059021	0.063511	0.052987	0.026676	0.026590

Sixth permutation: (1, 2, 4, 3, 0)#

perm = (1, 2, 4, 3, 0)
# This permutation has five axes, the inputs need a five-dimensional shape.
df, piv, ax = benchmark_op(perm, shape_fct=lambda dim: (3, dim, 1, 8, 512))
dfs.append(df)
# Keyword arguments: positional ones are deprecated in DataFrame.pivot.
df.pivot(index="fct", columns="N", values="average")

Transpose benchmark '(3, N, 1, 8, 512)' - (1, 2, 4, 3, 0) - abcde->bceda lower better, Transpose Speedup, baseline=numpy '(3, N, 1, 8, 512)' - (1, 2, 4, 3, 0) - abcde->bceda higher better

  0%|          | 0/11 [00:00<?, ?it/s]
  9%|9         | 1/11 [00:02<00:28,  2.80s/it]
 18%|#8        | 2/11 [00:06<00:30,  3.38s/it]
 27%|##7       | 3/11 [00:12<00:36,  4.56s/it]
 36%|###6      | 4/11 [00:22<00:47,  6.75s/it]
 45%|####5     | 5/11 [00:37<00:58,  9.75s/it]
 55%|#####4    | 6/11 [00:56<01:04, 12.84s/it]
 64%|######3   | 7/11 [01:24<01:11, 17.92s/it]
 73%|#######2  | 8/11 [02:00<01:10, 23.64s/it]
 82%|########1 | 9/11 [02:55<01:06, 33.48s/it]
 91%|######### | 10/11 [03:24<00:31, 31.82s/it]
100%|##########| 11/11 [04:19<00:00, 39.03s/it]
100%|##########| 11/11 [04:19<00:00, 23.58s/it]
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  piv = df.pivot('N', 'fct', 'average')
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_op_transpose.py:238: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  df.pivot("fct", "N", "average")

N	8	16	32	64	100	128	200	256	400	512	1024
fct
numpy	0.008064	0.016178	0.031716	0.062755	0.097224	0.124551	0.193305	0.248180	0.385573	0.198296	0.395463
ort	0.033566	0.066472	0.131140	0.259683	0.407118	0.516620	0.809991	1.034288	1.616078	0.829505	1.657548
torch	0.069457	0.066635	0.071837	0.074116	0.086293	0.097169	0.107384	0.122609	0.155789	0.072083	0.113549

Conclusion #

All libraries have similar implementations. The onnxruntime measurements include two more copies: one from the numpy container to the onnxruntime container, and another one back from the onnxruntime container to numpy. Parallelisation should be investigated.

# Gathers the measures of all permutations into a single dataframe
# and saves them as CSV and Excel files.
merged = pandas.concat(dfs)
name = "transpose"
merged.to_csv(f"plot_{name}.csv", index=False)
merged.to_excel(f"plot_{name}.xlsx", index=False)
# pyplot.savefig writes the current figure only.
plt.savefig(f"plot_{name}.png")

plt.show()

Total running time of the script: ( 7 minutes 38.602 seconds)

Gallery generated by Sphinx-Gallery