Compares mul implementations

numpy has a very fast implementation of matrix multiplication. There are many ways to be slower.

import pprint
import numpy
from numpy.testing import assert_almost_equal
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from td3a_cpp.tutorial.mul_cython_omp import dmul_cython_omp
from td3a_cpp.tools import measure_time_dim

# One benchmark DataFrame per measured implementation is collected here.
dfs = []
# Square-matrix sizes to benchmark: 2, 22, 42, ..., 142.
sets = [*range(2, 145, 20)]

numpy mul

# Baseline: numpy's own matrix product (operator @) on random square matrices.
ctxs = []
for n in sets:
    ctxs.append(dict(
        va=numpy.random.randn(n, n).astype(numpy.float64),
        vb=numpy.random.randn(n, n).astype(numpy.float64),
        mul=lambda x, y: x @ y,
        x_name=n))

res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
dfs.append(DataFrame(res))
dfs[-1]['fct'] = 'numpy'
pprint.pprint(dfs[-1].tail(n=2))

Out:

  0%|          | 0/8 [00:00<?, ?it/s]
 50%|#####     | 4/8 [00:00<00:00, 22.66it/s]
 88%|########7 | 7/8 [00:00<00:00, 14.12it/s]
100%|##########| 8/8 [00:00<00:00, 12.36it/s]
    average  deviation  min_exec  max_exec  ...  number  context_size  x_name    fct
6  0.000279   0.000083  0.000249  0.000529  ...      50           232     122  numpy
7  0.000364   0.000006  0.000361  0.000380  ...      50           232     142  numpy

[2 rows x 9 columns]

Simple multiplication

# Benchmark the plain dmul_cython_omp implementation (default options).
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
             vb=numpy.random.randn(n, n).astype(numpy.float64),
             mul=dmul_cython_omp,
             x_name=n)
        for n in sets]

res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
# BUG FIX: these results were previously discarded -- `res` was never
# appended to `dfs`, so the tail printed below showed the *numpy* frame
# again and the simple multiplication never appeared in the final plots.
dfs.append(DataFrame(res))
dfs[-1]['fct'] = 'simple'
pprint.pprint(dfs[-1].tail(n=2))

Out:

  0%|          | 0/8 [00:00<?, ?it/s]
 38%|###7      | 3/8 [00:00<00:00, 25.90it/s]
 75%|#######5  | 6/8 [00:02<00:00,  2.55it/s]
100%|##########| 8/8 [00:06<00:00,  1.13s/it]
100%|##########| 8/8 [00:06<00:00,  1.16it/s]
    average  deviation  min_exec  max_exec  ...  number  context_size  x_name    fct
6  0.000279   0.000083  0.000249  0.000529  ...      50           232     122  numpy
7  0.000364   0.000006  0.000361  0.000380  ...      50           232     142  numpy

[2 rows x 9 columns]

Other scenarios

3 different algorithms, each of them parallelized. See dmul_cython_omp.

# Benchmark every (algo, parallel) combination of dmul_cython_omp.
for algo in range(0, 2):
    for parallel in (0, 1):
        print("algo=%d parallel=%d" % (algo, parallel))
        # Bind algo/parallel as lambda default arguments: a plain closure
        # would look the loop variables up at call time (late binding),
        # which silently uses the *last* loop values if any lambda is
        # called after its iteration ends.
        ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
                     vb=numpy.random.randn(n, n).astype(numpy.float64),
                     mul=lambda x, y, algo=algo, parallel=parallel:
                         dmul_cython_omp(x, y, algo=algo, parallel=parallel),
                     x_name=n)
                for n in sets]

        res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
        dfs.append(DataFrame(res))
        dfs[-1]['fct'] = 'a=%d-p=%d' % (algo, parallel)
        pprint.pprint(dfs[-1].tail(n=2))

Out:

algo=0 parallel=0

  0%|          | 0/8 [00:00<?, ?it/s]
 38%|###7      | 3/8 [00:00<00:00, 24.70it/s]
 75%|#######5  | 6/8 [00:02<00:00,  2.53it/s]
100%|##########| 8/8 [00:06<00:00,  1.13s/it]
100%|##########| 8/8 [00:06<00:00,  1.15it/s]
    average  deviation  min_exec  ...  context_size  x_name      fct
6  0.003823   0.000032  0.003779  ...           232     122  a=0-p=0
7  0.005984   0.000029  0.005925  ...           232     142  a=0-p=0

[2 rows x 9 columns]
algo=0 parallel=1

  0%|          | 0/8 [00:00<?, ?it/s]
 50%|#####     | 4/8 [00:00<00:00, 29.47it/s]
 88%|########7 | 7/8 [00:01<00:00,  5.94it/s]
100%|##########| 8/8 [00:01<00:00,  4.68it/s]
    average  deviation  min_exec  ...  context_size  x_name      fct
6  0.000911   0.000014  0.000904  ...           232     122  a=0-p=1
7  0.001378   0.000010  0.001366  ...           232     142  a=0-p=1

[2 rows x 9 columns]
algo=1 parallel=0

  0%|          | 0/8 [00:00<?, ?it/s]
 38%|###7      | 3/8 [00:00<00:00, 24.06it/s]
 75%|#######5  | 6/8 [00:02<00:00,  2.38it/s]
100%|##########| 8/8 [00:07<00:00,  1.21s/it]
100%|##########| 8/8 [00:07<00:00,  1.08it/s]
    average  deviation  min_exec  ...  context_size  x_name      fct
6  0.004029   0.000001  0.004027  ...           232     122  a=1-p=0
7  0.006461   0.000003  0.006458  ...           232     142  a=1-p=0

[2 rows x 9 columns]
algo=1 parallel=1

  0%|          | 0/8 [00:00<?, ?it/s]
 50%|#####     | 4/8 [00:00<00:00, 29.09it/s]
 88%|########7 | 7/8 [00:01<00:00,  5.80it/s]
100%|##########| 8/8 [00:01<00:00,  4.57it/s]
    average  deviation  min_exec  ...  context_size  x_name      fct
6  0.000928   0.000008  0.000921  ...           232     122  a=1-p=1
7  0.001418   0.000011  0.001407  ...           232     142  a=1-p=1

[2 rows x 9 columns]

One left issue

Will you find it in dmul_cython_omp?

# Exhibit the remaining defect: algo=2 with parallel=1 occasionally
# produces a result that differs from numpy's reference product.
va = numpy.random.randn(3, 4).astype(numpy.float64)
vb = numpy.random.randn(4, 5).astype(numpy.float64)
numpy_mul = va @ vb

try:
    a = 0
    while a < 50:
        wrong_mul = dmul_cython_omp(va, vb, algo=2, parallel=1)
        assert_almost_equal(numpy_mul, wrong_mul)
        print("Iteration %d is Ok" % a)
        a += 1
    print("All iterations are unexpectedly Ok. Don't push your luck.")
except AssertionError as e:
    # Expected outcome: print the mismatch report produced by numpy.
    print(e)

Out:

Arrays are not almost equal to 7 decimals

Mismatched elements: 5 / 15 (33.3%)
Max absolute difference: 1.06269797
Max relative difference: 9.37717394
 x: array([[ 0.2502086, -0.430583 ,  2.7545071,  0.4333792, -2.8196375],
       [ 1.2871892,  0.0921121, -1.1760261,  1.1166405, -1.8620959],
       [-1.5047251, -0.4175358,  1.6848687,  0.0786706, -1.0260055]])
 y: array([[ 0.5306858, -0.3772123,  2.7545071,  0.4333792, -2.8196375],
       [ 1.2871892,  0.0921121, -0.1133282,  0.6908399, -1.8620959],
       [-1.5047251, -0.4175358,  1.6848687,  0.0786706, -0.9581005]])

Other scenarios but transposed

Same algorithms as before, but the second matrix is transposed first: b_trans=1.

# Same (algo, parallel) sweep, but with the second matrix transposed
# first inside dmul_cython_omp (b_trans=1).
for algo in range(0, 2):
    for parallel in (0, 1):
        print("algo=%d parallel=%d transposed" % (algo, parallel))
        # Bind algo/parallel through default arguments so each lambda
        # freezes the current loop values instead of late-binding them
        # at call time.
        ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
                     vb=numpy.random.randn(n, n).astype(numpy.float64),
                     mul=lambda x, y, algo=algo, parallel=parallel:
                         dmul_cython_omp(x, y, algo=algo, parallel=parallel,
                                         b_trans=1),
                     x_name=n)
                for n in sets]

        res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=2))
        dfs.append(DataFrame(res))
        dfs[-1]['fct'] = 'a=%d-p=%d-T' % (algo, parallel)
        pprint.pprint(dfs[-1].tail(n=2))

Out:

algo=0 parallel=0 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 50%|#####     | 4/8 [00:00<00:00, 14.62it/s]
 75%|#######5  | 6/8 [00:01<00:00,  3.62it/s]
 88%|########7 | 7/8 [00:02<00:00,  2.02it/s]
100%|##########| 8/8 [00:04<00:00,  1.18it/s]
100%|##########| 8/8 [00:04<00:00,  1.76it/s]
    average     deviation  min_exec  ...  context_size  x_name        fct
6  0.002463  8.650679e-07  0.002462  ...           232     122  a=0-p=0-T
7  0.003823  2.085923e-06  0.003821  ...           232     142  a=0-p=0-T

[2 rows x 9 columns]
algo=0 parallel=1 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|######2   | 5/8 [00:00<00:00, 31.12it/s]
100%|##########| 8/8 [00:00<00:00, 10.78it/s]
    average  deviation  min_exec  ...  context_size  x_name        fct
6  0.000375   0.000004  0.000374  ...           232     122  a=0-p=1-T
7  0.000550   0.000005  0.000547  ...           232     142  a=0-p=1-T

[2 rows x 9 columns]
algo=1 parallel=0 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 50%|#####     | 4/8 [00:00<00:00, 14.66it/s]
 75%|#######5  | 6/8 [00:01<00:00,  3.67it/s]
 88%|########7 | 7/8 [00:02<00:00,  2.05it/s]
100%|##########| 8/8 [00:04<00:00,  1.20it/s]
100%|##########| 8/8 [00:04<00:00,  1.78it/s]
    average  deviation  min_exec  ...  context_size  x_name        fct
6  0.002429   0.000015  0.002420  ...           232     122  a=1-p=0-T
7  0.003759   0.000002  0.003757  ...           232     142  a=1-p=0-T

[2 rows x 9 columns]
algo=1 parallel=1 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|######2   | 5/8 [00:00<00:00, 31.16it/s]
100%|##########| 8/8 [00:00<00:00, 10.70it/s]
    average     deviation  min_exec  ...  context_size  x_name        fct
6  0.000372  8.834384e-07  0.000371  ...           232     122  a=1-p=1-T
7  0.000562  6.782326e-05  0.000538  ...           232     142  a=1-p=1-T

[2 rows x 9 columns]

Let’s display the results

# Merge every benchmark frame; the matrix size becomes the x-axis.
cc = concat(dfs)
cc['N'] = cc['x_name']

# 3x2 grid: top row = all/non-numpy, middle row = transposed variants,
# bottom right = algo 0 only.
fig, ax = plt.subplots(3, 2, figsize=(10, 8))
# NOTE: pivot's positional (index, columns, values) form was deprecated in
# pandas 1.1 and removed in pandas 2.0 -- keyword arguments keep this working.
cc[~cc.fct.str.contains('-T')].pivot(
    index='N', columns='fct', values='average').plot(
        logy=True, logx=True, ax=ax[0, 0])
cc[~cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot(
    index='N', columns='fct', values='average').plot(
        logy=True, logx=True, ax=ax[0, 1])
cc[cc.fct.str.contains('-T') | (cc.fct == 'numpy')].pivot(
    index='N', columns='fct', values='average').plot(
        logy=True, logx=True, ax=ax[1, 0])
cc[cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot(
    index='N', columns='fct', values='average').plot(
        logy=True, logx=True, ax=ax[1, 1])
cc[cc.fct.str.contains('a=0')].pivot(
    index='N', columns='fct', values='average').plot(
        logy=True, logx=True, ax=ax[2, 1])
fig.suptitle("Comparison of multiplication implementations")
Comparison of multiplication implementations

Out:

Text(0.5, 0.98, 'Comparison of multiplication implementations')

The results depend on the machine, its number of cores, and the compilation settings of numpy or this module.

# Render all the figures built above (blocks until the window is closed).
plt.show()

Total running time of the script: ( 0 minutes 41.415 seconds)

Gallery generated by Sphinx-Gallery