Note
Go to the end to download the full example code
Compares matrix multiplication implementations¶
numpy has a very fast implementation of matrix multiplication. There are many ways to be slower.
Compared implementations:
import pprint
import numpy
from numpy.testing import assert_almost_equal
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from td3a_cpp.tutorial.mul_cython_omp import dmul_cython_omp
from td3a_cpp.tools import measure_time_dim
dfs = []
sets = list(range(2, 145, 20))
numpy mul¶
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
vb=numpy.random.randn(n, n).astype(numpy.float64),
mul=lambda x, y: x @ y,
x_name=n)
for n in sets]
res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
dfs.append(DataFrame(res))
dfs[-1]['fct'] = 'numpy'
pprint.pprint(dfs[-1].tail(n=2))
0%| | 0/8 [00:00<?, ?it/s]
50%|##### | 4/8 [00:00<00:00, 16.40it/s]
75%|#######5 | 6/8 [00:00<00:00, 13.80it/s]
100%|##########| 8/8 [00:00<00:00, 8.24it/s]
100%|##########| 8/8 [00:00<00:00, 9.65it/s]
average deviation min_exec max_exec ... number context_size x_name fct
6 0.000330 0.000001 0.000329 0.000333 ... 50 232 122 numpy
7 0.000478 0.000002 0.000477 0.000484 ... 50 232 142 numpy
[2 rows x 9 columns]
Simple multiplication¶
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
vb=numpy.random.randn(n, n).astype(numpy.float64),
mul=dmul_cython_omp,
x_name=n)
for n in sets]
res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
pprint.pprint(res[-1])
0%| | 0/8 [00:00<?, ?it/s]
38%|###7 | 3/8 [00:00<00:00, 19.71it/s]
62%|######2 | 5/8 [00:01<00:00, 3.33it/s]
75%|#######5 | 6/8 [00:02<00:01, 1.66it/s]
88%|########7 | 7/8 [00:05<00:01, 1.13s/it]
100%|##########| 8/8 [00:09<00:00, 1.95s/it]
100%|##########| 8/8 [00:09<00:00, 1.17s/it]
{'average': 0.008107981523498892,
'context_size': 232,
'deviation': 2.1798378263439107e-06,
'max_exec': 0.008112866496667266,
'min_exec': 0.008105753352865577,
'number': 50,
'repeat': 10,
'x_name': 142}
Other scenarios¶
3 differents algorithms, each of them parallelized.
See dmul_cython_omp
.
for algo in range(0, 2):
for parallel in (0, 1):
print("algo=%d parallel=%d" % (algo, parallel))
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
vb=numpy.random.randn(n, n).astype(numpy.float64),
mul=lambda x, y: dmul_cython_omp(
x, y, algo=algo, parallel=parallel),
x_name=n)
for n in sets]
res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
dfs.append(DataFrame(res))
dfs[-1]['fct'] = 'a=%d-p=%d' % (algo, parallel)
pprint.pprint(dfs[-1].tail(n=2))
algo=0 parallel=0
0%| | 0/8 [00:00<?, ?it/s]
38%|###7 | 3/8 [00:00<00:00, 18.89it/s]
62%|######2 | 5/8 [00:01<00:00, 3.30it/s]
75%|#######5 | 6/8 [00:02<00:01, 1.66it/s]
88%|########7 | 7/8 [00:05<00:01, 1.12s/it]
100%|##########| 8/8 [00:09<00:00, 1.93s/it]
100%|##########| 8/8 [00:09<00:00, 1.17s/it]
average deviation min_exec ... context_size x_name fct
6 0.005033 0.000002 0.005031 ... 232 122 a=0-p=0
7 0.008089 0.000003 0.008085 ... 232 142 a=0-p=0
[2 rows x 9 columns]
algo=0 parallel=1
0%| | 0/8 [00:00<?, ?it/s]
50%|##### | 4/8 [00:00<00:00, 21.80it/s]
88%|########7 | 7/8 [00:01<00:00, 4.26it/s]
100%|##########| 8/8 [00:02<00:00, 3.41it/s]
average deviation min_exec ... context_size x_name fct
6 0.001276 0.000086 0.001243 ... 232 122 a=0-p=1
7 0.001861 0.000002 0.001859 ... 232 142 a=0-p=1
[2 rows x 9 columns]
algo=1 parallel=0
0%| | 0/8 [00:00<?, ?it/s]
38%|###7 | 3/8 [00:00<00:00, 17.53it/s]
62%|######2 | 5/8 [00:01<00:00, 3.04it/s]
75%|#######5 | 6/8 [00:02<00:01, 1.53it/s]
88%|########7 | 7/8 [00:05<00:01, 1.22s/it]
100%|##########| 8/8 [00:10<00:00, 2.11s/it]
100%|##########| 8/8 [00:10<00:00, 1.27s/it]
average deviation min_exec ... context_size x_name fct
6 0.005519 0.000001 0.005517 ... 232 122 a=1-p=0
7 0.008855 0.000003 0.008853 ... 232 142 a=1-p=0
[2 rows x 9 columns]
algo=1 parallel=1
0%| | 0/8 [00:00<?, ?it/s]
50%|##### | 4/8 [00:00<00:00, 20.76it/s]
88%|########7 | 7/8 [00:01<00:00, 4.36it/s]
100%|##########| 8/8 [00:02<00:00, 3.44it/s]
average deviation min_exec ... context_size x_name fct
6 0.001246 0.000008 0.00124 ... 232 122 a=1-p=1
7 0.001878 0.000005 0.00187 ... 232 142 a=1-p=1
[2 rows x 9 columns]
One left issue¶
Will you find it in dmul_cython_omp
.
va = numpy.random.randn(3, 4).astype(numpy.float64)
vb = numpy.random.randn(4, 5).astype(numpy.float64)
numpy_mul = va @ vb
try:
for a in range(0, 50):
wrong_mul = dmul_cython_omp(va, vb, algo=2, parallel=1)
assert_almost_equal(numpy_mul, wrong_mul)
print("Iteration %d is Ok" % a)
print("All iterations are unexpectedly Ok. Don't push your luck.")
except AssertionError as e:
print(e)
Arrays are not almost equal to 7 decimals
Mismatched elements: 3 / 15 (20%)
Max absolute difference: 0.98678316
Max relative difference: 2.10085769e-16
x: array([[-2.1521276, -1.4024366, -0.8470817, 1.8995522, -0.9867832],
[-0.1073382, 1.9952894, -0.1321154, 1.6005042, 0.0930345],
[-1.10652 , -2.1936239, -0.5114178, 0.0623881, -0.3139902]])
y: array([[-2.1521276, -1.4024366, -0.8470817, 1.8995522, 0. ],
[-0.1073382, 1.9952894, -0.1321154, 1.6005042, 0. ],
[-1.10652 , -2.1936239, -0.5114178, 0.0623881, 0. ]])
Other scenarios but transposed¶
Same differents algorithms but the second matrix
is transposed first: b_trans=1
.
for algo in range(0, 2):
for parallel in (0, 1):
print("algo=%d parallel=%d transposed" % (algo, parallel))
ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
vb=numpy.random.randn(n, n).astype(numpy.float64),
mul=lambda x, y: dmul_cython_omp(
x, y, algo=algo, parallel=parallel, b_trans=1),
x_name=n)
for n in sets]
res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=2))
dfs.append(DataFrame(res))
dfs[-1]['fct'] = 'a=%d-p=%d-T' % (algo, parallel)
pprint.pprint(dfs[-1].tail(n=2))
algo=0 parallel=0 transposed
0%| | 0/8 [00:00<?, ?it/s]
38%|###7 | 3/8 [00:00<00:00, 25.04it/s]
75%|#######5 | 6/8 [00:01<00:00, 2.69it/s]
100%|##########| 8/8 [00:06<00:00, 1.02s/it]
100%|##########| 8/8 [00:06<00:00, 1.28it/s]
average deviation min_exec ... context_size x_name fct
6 0.003414 0.000003 0.003411 ... 232 122 a=0-p=0-T
7 0.005236 0.000003 0.005234 ... 232 142 a=0-p=0-T
[2 rows x 9 columns]
algo=0 parallel=1 transposed
0%| | 0/8 [00:00<?, ?it/s]
50%|##### | 4/8 [00:00<00:00, 33.63it/s]
100%|##########| 8/8 [00:01<00:00, 7.00it/s]
100%|##########| 8/8 [00:01<00:00, 7.94it/s]
average deviation min_exec ... context_size x_name fct
6 0.000528 0.000058 0.000504 ... 232 122 a=0-p=1-T
7 0.000737 0.000003 0.000735 ... 232 142 a=0-p=1-T
[2 rows x 9 columns]
algo=1 parallel=0 transposed
0%| | 0/8 [00:00<?, ?it/s]
38%|###7 | 3/8 [00:00<00:00, 25.02it/s]
75%|#######5 | 6/8 [00:01<00:00, 2.74it/s]
100%|##########| 8/8 [00:06<00:00, 1.01it/s]
100%|##########| 8/8 [00:06<00:00, 1.30it/s]
average deviation min_exec ... context_size x_name fct
6 0.003318 0.000001 0.003316 ... 232 122 a=1-p=0-T
7 0.005143 0.000001 0.005142 ... 232 142 a=1-p=0-T
[2 rows x 9 columns]
algo=1 parallel=1 transposed
0%| | 0/8 [00:00<?, ?it/s]
50%|##### | 4/8 [00:00<00:00, 33.17it/s]
100%|##########| 8/8 [00:00<00:00, 7.15it/s]
100%|##########| 8/8 [00:00<00:00, 8.10it/s]
average deviation min_exec ... context_size x_name fct
6 0.000502 0.000002 0.000501 ... 232 122 a=1-p=1-T
7 0.000726 0.000003 0.000723 ... 232 142 a=1-p=1-T
[2 rows x 9 columns]
Let’s display the results¶
cc = concat(dfs)
cc['N'] = cc['x_name']
fig, ax = plt.subplots(3, 2, figsize=(10, 8), sharex=True, sharey=True)
ccnp = cc.fct == 'numpy'
cct = cc.fct.str.contains('-T')
cca0 = cc.fct.str.contains('a=0')
cc[ccnp | (~cct & cca0)].pivot(
'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 0])
cc[ccnp | (~cct & ~cca0)].pivot(
'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 1])
cc[ccnp | (cct & cca0)].pivot(
'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 0])
cc[ccnp | (~cct & ~cca0)].pivot(
'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 1])
cc[ccnp | cca0].pivot(index='N', columns='fct', values='average').plot(
logy=True, logx=True, ax=ax[2, 0])
cc[ccnp | ~cca0].pivot(index='N', columns='fct', values='average').plot(
logy=True, logx=True, ax=ax[2, 1])
fig.suptitle("Comparison of matrix multiplication implementations")
Traceback (most recent call last):
File "somewheretd3a_cpp_39_std/td3a_cpp/examples/plot_benchmark_dot_mul.py", line 144, in <module>
cc[ccnp | (~cct & cca0)].pivot(
TypeError: pivot() takes 1 positional argument but 4 were given
The results depends on the machine, its number of cores, the compilation settings of numpy or this module.
plt.show()
Total running time of the script: ( 0 minutes 51.066 seconds)