Note
Go to the end to download the full example code
Compares dot implementations (numpy, C++, SSE, OpenMP)
numpy has a very fast implementation of the dot product. It is difficult to do better and very easy to do worse. This example looks into a few slower implementations written with Cython. The tested functions are the following:
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from td3a_cpp.tutorial.dot_cython import (
ddot_array_16_sse, ddot_array
)
from td3a_cpp.tutorial.dot_cython_omp import (
ddot_cython_array_omp,
ddot_array_openmp,
get_omp_max_threads,
ddot_array_openmp_16
)
from td3a_cpp.tools import measure_time_dim
def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    The sizes are ``range(10, n, h)``. For every size, two random vectors
    are drawn with ``numpy.random.randn`` and cast to *dtype*.

    :param fct: callable stored under the ``'dot'`` key of every context
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: dtype the random vectors are cast to
    :return: list of dictionaries with keys ``'va'``, ``'vb'``, ``'dot'``
        and ``'x_name'`` (the vector size)
    """
    contexts = []
    for size in range(10, n, h):
        # *va* is drawn before *vb*, the same order as the keyword
        # arguments, so the random stream is consumed identically.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append(dict(va=left, vb=right, dot=fct, x_name=size))
    return contexts
Number of threads
# Show the value reported by get_omp_max_threads() for this machine.
max_threads = get_omp_max_threads()
print(max_threads)
8
Several Cython dot implementations
def numpy_dot(va, vb):
    """Return the dot product of *va* and *vb* computed by ``numpy.dot``.

    Serves as the numpy baseline in the benchmark below.
    """
    product = numpy.dot(va, vb)
    return product
def ddot_omp(va, vb):
    """Return ``ddot_cython_array_omp(va, vb)`` with its default options.

    No ``schedule`` argument is passed, so the default of the compiled
    function applies.
    """
    product = ddot_cython_array_omp(va, vb)
    return product
def ddot_omp_static(va, vb):
    """Return ``ddot_cython_array_omp(va, vb, schedule=1)``.

    NOTE(review): judging by this wrapper's name, ``schedule=1``
    presumably selects OpenMP static scheduling -- confirm against the
    td3a_cpp implementation.
    """
    product = ddot_cython_array_omp(va, vb, schedule=1)
    return product
def ddot_omp_dyn(va, vb):
    """Return ``ddot_cython_array_omp(va, vb, schedule=2)``.

    NOTE(review): judging by this wrapper's name, ``schedule=2``
    presumably selects OpenMP dynamic scheduling -- confirm against the
    td3a_cpp implementation.
    """
    product = ddot_cython_array_omp(va, vb, schedule=2)
    return product
def ddot_omp_cpp(va, vb):
    """Return ``ddot_array_openmp(va, vb)``.

    Gives the compiled function a short, distinct ``__name__`` for the
    benchmark report.
    """
    product = ddot_array_openmp(va, vb)
    return product
def ddot_omp_cpp_16(va, vb):
    """Return ``ddot_array_openmp_16(va, vb)``.

    Gives the compiled function a short, distinct ``__name__`` for the
    benchmark report.
    """
    product = ddot_array_openmp_16(va, vb)
    return product
# Functions to benchmark, in the order they are measured and reported.
benchmarked = (
    numpy_dot,
    ddot_array,
    ddot_array_16_sse,
    ddot_omp,
    ddot_omp_static,
    ddot_omp_dyn,
    ddot_omp_cpp,
    ddot_omp_cpp_16,
)

# One DataFrame of timings per benchmarked function.
dfs = []
for fct in benchmarked:
    # Fresh random vectors for every function: sizes 10, 260, ..., 39760.
    ctxs = get_vectors(fct, 40000)
    print(fct.__name__)
    # Each context exposes the names 'dot', 'va' and 'vb' used by the
    # timed statement.
    timings = measure_time_dim('dot(va, vb)', ctxs, verbose=1)
    df = DataFrame(list(timings))
    # Label the rows with the function name so they can be told apart
    # once the frames are combined.
    df['fct'] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot
0%| | 0/160 [00:00<?, ?it/s]
9%|9 | 15/160 [00:00<00:01, 139.88it/s]
18%|#8 | 29/160 [00:00<00:01, 116.55it/s]
26%|##5 | 41/160 [00:00<00:01, 98.21it/s]
32%|###2 | 52/160 [00:00<00:01, 89.24it/s]
39%|###8 | 62/160 [00:00<00:01, 83.60it/s]
44%|####4 | 71/160 [00:00<00:01, 79.25it/s]
49%|####9 | 79/160 [00:00<00:01, 76.13it/s]
54%|#####4 | 87/160 [00:01<00:00, 73.73it/s]
59%|#####9 | 95/160 [00:01<00:00, 71.21it/s]
64%|######4 | 103/160 [00:01<00:00, 65.50it/s]
69%|######8 | 110/160 [00:01<00:00, 66.20it/s]
74%|#######3 | 118/160 [00:01<00:00, 64.51it/s]
78%|#######8 | 125/160 [00:01<00:00, 56.49it/s]
82%|########1 | 131/160 [00:01<00:00, 54.07it/s]
86%|########5 | 137/160 [00:01<00:00, 50.37it/s]
89%|########9 | 143/160 [00:02<00:00, 51.96it/s]
93%|#########3| 149/160 [00:02<00:00, 51.18it/s]
97%|#########6| 155/160 [00:02<00:00, 52.41it/s]
100%|##########| 160/160 [00:02<00:00, 66.88it/s]
average deviation min_exec ... context_size x_name fct
157 0.000034 3.487521e-07 0.000033 ... 232 39260 numpy_dot
158 0.000033 4.206640e-07 0.000033 ... 232 39510 numpy_dot
159 0.000029 4.578539e-06 0.000024 ... 232 39760 numpy_dot
[3 rows x 9 columns]
ddot_array
0%| | 0/160 [00:00<?, ?it/s]
11%|# | 17/160 [00:00<00:00, 169.14it/s]
21%|##1 | 34/160 [00:00<00:01, 112.61it/s]
29%|##9 | 47/160 [00:00<00:01, 87.53it/s]
36%|###5 | 57/160 [00:00<00:01, 73.42it/s]
41%|#### | 65/160 [00:00<00:01, 63.94it/s]
45%|####5 | 72/160 [00:01<00:01, 56.78it/s]
49%|####8 | 78/160 [00:01<00:01, 51.15it/s]
52%|#####2 | 84/160 [00:01<00:01, 46.50it/s]
56%|#####5 | 89/160 [00:01<00:01, 42.98it/s]
59%|#####8 | 94/160 [00:01<00:01, 39.59it/s]
61%|######1 | 98/160 [00:01<00:01, 37.41it/s]
64%|######3 | 102/160 [00:01<00:01, 35.21it/s]
66%|######6 | 106/160 [00:02<00:01, 33.44it/s]
69%|######8 | 110/160 [00:02<00:01, 31.68it/s]
71%|#######1 | 114/160 [00:02<00:01, 30.28it/s]
74%|#######3 | 118/160 [00:02<00:01, 28.89it/s]
76%|#######5 | 121/160 [00:02<00:01, 28.03it/s]
78%|#######7 | 124/160 [00:02<00:01, 27.05it/s]
79%|#######9 | 127/160 [00:02<00:01, 26.31it/s]
81%|########1 | 130/160 [00:02<00:01, 25.41it/s]
83%|########3 | 133/160 [00:03<00:01, 24.47it/s]
85%|########5 | 136/160 [00:03<00:01, 23.77it/s]
87%|########6 | 139/160 [00:03<00:00, 23.17it/s]
89%|########8 | 142/160 [00:03<00:00, 22.38it/s]
91%|######### | 145/160 [00:03<00:00, 21.81it/s]
92%|#########2| 148/160 [00:03<00:00, 21.24it/s]
94%|#########4| 151/160 [00:03<00:00, 20.82it/s]
96%|#########6| 154/160 [00:04<00:00, 20.28it/s]
98%|#########8| 157/160 [00:04<00:00, 19.93it/s]
99%|#########9| 159/160 [00:04<00:00, 19.79it/s]
100%|##########| 160/160 [00:04<00:00, 35.95it/s]
average deviation min_exec ... context_size x_name fct
157 0.000103 5.276492e-07 0.000102 ... 232 39260 ddot_array
158 0.000098 4.303988e-07 0.000098 ... 232 39510 ddot_array
159 0.000112 5.036949e-07 0.000111 ... 232 39760 ddot_array
[3 rows x 9 columns]
ddot_array_16_sse
0%| | 0/160 [00:00<?, ?it/s]
12%|#1 | 19/160 [00:00<00:00, 188.75it/s]
24%|##3 | 38/160 [00:00<00:00, 129.93it/s]
33%|###3 | 53/160 [00:00<00:01, 101.77it/s]
41%|#### | 65/160 [00:00<00:01, 85.07it/s]
47%|####6 | 75/160 [00:00<00:01, 73.99it/s]
52%|#####1 | 83/160 [00:01<00:01, 66.04it/s]
56%|#####6 | 90/160 [00:01<00:01, 59.86it/s]
61%|###### | 97/160 [00:01<00:01, 54.32it/s]
64%|######4 | 103/160 [00:01<00:01, 50.10it/s]
68%|######8 | 109/160 [00:01<00:01, 46.48it/s]
71%|#######1 | 114/160 [00:01<00:01, 43.38it/s]
74%|#######4 | 119/160 [00:01<00:01, 40.82it/s]
78%|#######7 | 124/160 [00:02<00:00, 38.36it/s]
80%|######## | 128/160 [00:02<00:00, 36.67it/s]
82%|########2 | 132/160 [00:02<00:00, 34.85it/s]
85%|########5 | 136/160 [00:02<00:00, 33.53it/s]
88%|########7 | 140/160 [00:02<00:00, 32.03it/s]
90%|######### | 144/160 [00:02<00:00, 30.89it/s]
92%|#########2| 148/160 [00:02<00:00, 29.86it/s]
94%|#########4| 151/160 [00:03<00:00, 29.03it/s]
96%|#########6| 154/160 [00:03<00:00, 28.04it/s]
98%|#########8| 157/160 [00:03<00:00, 27.53it/s]
100%|##########| 160/160 [00:03<00:00, 26.77it/s]
100%|##########| 160/160 [00:03<00:00, 47.62it/s]
average deviation min_exec ... context_size x_name fct
157 0.000079 3.905469e-07 0.000078 ... 232 39260 ddot_array_16_sse
158 0.000075 8.012449e-07 0.000074 ... 232 39510 ddot_array_16_sse
159 0.000079 5.563830e-07 0.000078 ... 232 39760 ddot_array_16_sse
[3 rows x 9 columns]
ddot_omp
0%| | 0/160 [00:00<?, ?it/s]
7%|6 | 11/160 [00:00<00:01, 101.43it/s]
14%|#3 | 22/160 [00:00<00:03, 40.15it/s]
18%|#7 | 28/160 [00:00<00:04, 32.44it/s]
22%|##2 | 36/160 [00:00<00:03, 41.19it/s]
28%|##8 | 45/160 [00:00<00:02, 51.38it/s]
32%|###2 | 52/160 [00:01<00:01, 55.63it/s]
37%|###6 | 59/160 [00:01<00:01, 50.88it/s]
42%|####1 | 67/160 [00:01<00:01, 57.64it/s]
46%|####6 | 74/160 [00:01<00:01, 48.11it/s]
50%|##### | 80/160 [00:01<00:01, 49.41it/s]
54%|#####4 | 87/160 [00:01<00:01, 50.00it/s]
58%|#####8 | 93/160 [00:02<00:02, 28.49it/s]
63%|######3 | 101/160 [00:02<00:01, 36.11it/s]
67%|######6 | 107/160 [00:02<00:01, 35.55it/s]
71%|####### | 113/160 [00:02<00:01, 38.83it/s]
74%|#######3 | 118/160 [00:02<00:01, 37.52it/s]
78%|#######8 | 125/160 [00:02<00:00, 43.69it/s]
82%|########1 | 131/160 [00:03<00:00, 34.67it/s]
86%|########6 | 138/160 [00:03<00:00, 37.99it/s]
91%|######### | 145/160 [00:03<00:00, 42.91it/s]
94%|#########3| 150/160 [00:03<00:00, 37.88it/s]
97%|#########6| 155/160 [00:03<00:00, 36.70it/s]
100%|##########| 160/160 [00:03<00:00, 41.68it/s]
average deviation min_exec ... context_size x_name fct
157 0.000030 5.419707e-07 0.000030 ... 232 39260 ddot_omp
158 0.000031 4.931126e-07 0.000030 ... 232 39510 ddot_omp
159 0.000040 3.528745e-05 0.000028 ... 232 39760 ddot_omp
[3 rows x 9 columns]
ddot_omp_static
0%| | 0/160 [00:00<?, ?it/s]
4%|3 | 6/160 [00:00<00:02, 59.37it/s]
9%|8 | 14/160 [00:00<00:02, 61.79it/s]
14%|#4 | 23/160 [00:00<00:01, 70.41it/s]
19%|#9 | 31/160 [00:04<00:26, 4.85it/s]
24%|##3 | 38/160 [00:04<00:17, 7.00it/s]
28%|##7 | 44/160 [00:05<00:20, 5.67it/s]
30%|### | 48/160 [00:06<00:16, 6.90it/s]
34%|###4 | 55/160 [00:06<00:10, 9.87it/s]
38%|###8 | 61/160 [00:06<00:07, 13.15it/s]
41%|####1 | 66/160 [00:06<00:05, 16.22it/s]
44%|####4 | 71/160 [00:08<00:15, 5.92it/s]
47%|####6 | 75/160 [00:09<00:15, 5.32it/s]
51%|##### | 81/160 [00:09<00:10, 7.66it/s]
53%|#####3 | 85/160 [00:09<00:07, 9.46it/s]
56%|#####6 | 90/160 [00:10<00:05, 12.00it/s]
59%|#####8 | 94/160 [00:10<00:06, 10.29it/s]
62%|######2 | 100/160 [00:10<00:04, 14.41it/s]
66%|######6 | 106/160 [00:10<00:02, 19.01it/s]
69%|######8 | 110/160 [00:11<00:02, 19.84it/s]
71%|#######1 | 114/160 [00:11<00:02, 20.19it/s]
74%|#######3 | 118/160 [00:11<00:01, 21.85it/s]
76%|#######6 | 122/160 [00:11<00:02, 18.70it/s]
79%|#######9 | 127/160 [00:11<00:01, 22.34it/s]
81%|########1 | 130/160 [00:12<00:01, 15.71it/s]
83%|########3 | 133/160 [00:13<00:03, 8.83it/s]
86%|########5 | 137/160 [00:13<00:01, 11.57it/s]
88%|########7 | 140/160 [00:13<00:01, 13.49it/s]
89%|########9 | 143/160 [00:13<00:01, 15.25it/s]
91%|#########1| 146/160 [00:13<00:00, 16.60it/s]
93%|#########3| 149/160 [00:13<00:00, 16.28it/s]
95%|#########5| 152/160 [00:13<00:00, 18.02it/s]
98%|#########7| 156/160 [00:13<00:00, 22.16it/s]
99%|#########9| 159/160 [00:14<00:00, 21.07it/s]
100%|##########| 160/160 [00:14<00:00, 11.32it/s]
average deviation min_exec ... context_size x_name fct
157 0.000055 9.863351e-07 0.000054 ... 232 39260 ddot_omp_static
158 0.000166 3.068552e-04 0.000047 ... 232 39510 ddot_omp_static
159 0.000050 7.934247e-07 0.000049 ... 232 39760 ddot_omp_static
[3 rows x 9 columns]
ddot_omp_dyn
0%| | 0/160 [00:00<?, ?it/s]
1%| | 1/160 [00:00<01:11, 2.23it/s]
6%|5 | 9/160 [00:00<00:07, 20.40it/s]
9%|8 | 14/160 [00:00<00:05, 26.63it/s]
12%|#1 | 19/160 [00:00<00:04, 31.25it/s]
15%|#5 | 24/160 [00:00<00:04, 33.29it/s]
18%|#8 | 29/160 [00:01<00:03, 32.77it/s]
21%|## | 33/160 [00:01<00:03, 31.93it/s]
23%|##3 | 37/160 [00:01<00:03, 32.80it/s]
26%|##5 | 41/160 [00:01<00:03, 30.63it/s]
28%|##8 | 45/160 [00:01<00:03, 31.26it/s]
31%|### | 49/160 [00:01<00:03, 29.40it/s]
33%|###3 | 53/160 [00:01<00:03, 27.93it/s]
35%|###5 | 56/160 [00:02<00:03, 26.81it/s]
37%|###6 | 59/160 [00:02<00:06, 14.68it/s]
39%|###8 | 62/160 [00:02<00:07, 13.07it/s]
40%|#### | 64/160 [00:03<00:07, 12.36it/s]
41%|####1 | 66/160 [00:03<00:07, 13.22it/s]
42%|####2 | 68/160 [00:03<00:06, 13.70it/s]
44%|####4 | 71/160 [00:03<00:05, 16.45it/s]
46%|####5 | 73/160 [00:03<00:05, 14.79it/s]
47%|####6 | 75/160 [00:05<00:20, 4.13it/s]
49%|####8 | 78/160 [00:05<00:13, 5.86it/s]
50%|##### | 80/160 [00:05<00:11, 7.04it/s]
51%|#####1 | 82/160 [00:06<00:21, 3.59it/s]
53%|#####3 | 85/160 [00:06<00:14, 5.17it/s]
54%|#####4 | 87/160 [00:07<00:13, 5.30it/s]
56%|#####5 | 89/160 [00:07<00:10, 6.47it/s]
57%|#####6 | 91/160 [00:08<00:18, 3.72it/s]
58%|#####8 | 93/160 [00:08<00:14, 4.57it/s]
59%|#####9 | 95/160 [00:08<00:12, 5.22it/s]
61%|###### | 97/160 [00:09<00:12, 5.11it/s]
62%|######2 | 100/160 [00:09<00:10, 5.91it/s]
63%|######3 | 101/160 [00:09<00:11, 5.19it/s]
64%|######3 | 102/160 [00:10<00:13, 4.36it/s]
64%|######4 | 103/160 [00:10<00:11, 4.76it/s]
66%|######5 | 105/160 [00:10<00:08, 6.63it/s]
67%|######6 | 107/160 [00:10<00:08, 6.14it/s]
68%|######8 | 109/160 [00:10<00:06, 7.93it/s]
69%|######9 | 111/160 [00:11<00:05, 9.49it/s]
71%|####### | 113/160 [00:11<00:07, 6.66it/s]
72%|#######1 | 115/160 [00:11<00:06, 7.36it/s]
73%|#######3 | 117/160 [00:12<00:05, 7.97it/s]
74%|#######4 | 119/160 [00:12<00:04, 9.31it/s]
76%|#######5 | 121/160 [00:12<00:04, 8.84it/s]
77%|#######6 | 123/160 [00:12<00:04, 8.62it/s]
78%|#######8 | 125/160 [00:12<00:03, 9.94it/s]
79%|#######9 | 127/160 [00:12<00:02, 11.16it/s]
81%|######## | 129/160 [00:13<00:02, 11.90it/s]
82%|########1 | 131/160 [00:13<00:02, 9.91it/s]
83%|########3 | 133/160 [00:13<00:02, 11.42it/s]
84%|########4 | 135/160 [00:13<00:01, 12.73it/s]
86%|########5 | 137/160 [00:13<00:01, 13.80it/s]
87%|########6 | 139/160 [00:13<00:01, 14.22it/s]
88%|########8 | 141/160 [00:14<00:01, 11.95it/s]
89%|########9 | 143/160 [00:14<00:01, 10.13it/s]
91%|######### | 145/160 [00:14<00:01, 8.60it/s]
92%|#########1| 147/160 [00:14<00:01, 8.73it/s]
93%|#########3| 149/160 [00:15<00:01, 8.95it/s]
94%|#########4| 151/160 [00:15<00:01, 6.66it/s]
96%|#########5| 153/160 [00:15<00:00, 7.28it/s]
97%|#########6| 155/160 [00:17<00:01, 2.71it/s]
98%|#########7| 156/160 [00:21<00:03, 1.08it/s]
98%|#########8| 157/160 [00:21<00:02, 1.18it/s]
99%|#########9| 159/160 [00:21<00:00, 1.77it/s]
100%|##########| 160/160 [00:21<00:00, 7.33it/s]
average deviation min_exec ... context_size x_name fct
157 0.000133 0.000010 0.000127 ... 232 39260 ddot_omp_dyn
158 0.000135 0.000017 0.000128 ... 232 39510 ddot_omp_dyn
159 0.000134 0.000010 0.000128 ... 232 39760 ddot_omp_dyn
[3 rows x 9 columns]
ddot_omp_cpp
0%| | 0/160 [00:00<?, ?it/s]
8%|8 | 13/160 [00:00<00:01, 124.98it/s]
16%|#6 | 26/160 [00:00<00:01, 120.16it/s]
24%|##4 | 39/160 [00:00<00:01, 115.34it/s]
32%|###1 | 51/160 [00:00<00:00, 110.71it/s]
39%|###9 | 63/160 [00:00<00:00, 106.24it/s]
46%|####6 | 74/160 [00:00<00:00, 102.42it/s]
53%|#####3 | 85/160 [00:00<00:00, 98.69it/s]
59%|#####9 | 95/160 [00:00<00:00, 95.61it/s]
66%|######5 | 105/160 [00:01<00:00, 92.35it/s]
72%|#######1 | 115/160 [00:01<00:00, 89.29it/s]
78%|#######7 | 124/160 [00:01<00:00, 86.61it/s]
83%|########3 | 133/160 [00:01<00:00, 83.68it/s]
89%|########8 | 142/160 [00:01<00:00, 81.10it/s]
94%|#########4| 151/160 [00:01<00:00, 78.54it/s]
99%|#########9| 159/160 [00:01<00:00, 76.40it/s]
100%|##########| 160/160 [00:01<00:00, 91.08it/s]
average deviation min_exec ... context_size x_name fct
157 0.000026 3.135781e-07 0.000026 ... 232 39260 ddot_omp_cpp
158 0.000027 2.956589e-07 0.000027 ... 232 39510 ddot_omp_cpp
159 0.000026 5.323258e-07 0.000026 ... 232 39760 ddot_omp_cpp
[3 rows x 9 columns]
ddot_omp_cpp_16
0%| | 0/160 [00:00<?, ?it/s]
8%|8 | 13/160 [00:00<00:01, 127.78it/s]
16%|#6 | 26/160 [00:00<00:01, 120.54it/s]
24%|##4 | 39/160 [00:00<00:01, 115.17it/s]
32%|###1 | 51/160 [00:00<00:00, 110.40it/s]
39%|###9 | 63/160 [00:00<00:00, 105.89it/s]
46%|####6 | 74/160 [00:00<00:00, 102.24it/s]
53%|#####3 | 85/160 [00:00<00:00, 98.63it/s]
59%|#####9 | 95/160 [00:00<00:00, 95.27it/s]
66%|######5 | 105/160 [00:01<00:00, 92.26it/s]
72%|#######1 | 115/160 [00:01<00:00, 89.21it/s]
78%|#######7 | 124/160 [00:01<00:00, 86.55it/s]
83%|########3 | 133/160 [00:01<00:00, 84.21it/s]
89%|########8 | 142/160 [00:01<00:00, 81.83it/s]
94%|#########4| 151/160 [00:01<00:00, 79.60it/s]
99%|#########9| 159/160 [00:01<00:00, 77.84it/s]
100%|##########| 160/160 [00:01<00:00, 91.60it/s]
average deviation min_exec ... context_size x_name fct
157 0.000026 2.998788e-07 0.000025 ... 232 39260 ddot_omp_cpp_16
158 0.000025 2.916635e-07 0.000025 ... 232 39510 ddot_omp_cpp_16
159 0.000025 4.160573e-07 0.000025 ... 232 39760 ddot_omp_cpp_16
[3 rows x 9 columns]
Let’s display the results
# Gather all timings in a single frame; 'N' is the vector size.
cc = concat(dfs)
cc['N'] = cc['x_name']

fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top-left: every function, small vectors only.
cc[cc.N <= 1000].pivot(index='N', columns='fct', values='average').plot(
    logy=True, ax=ax[0, 0])

# Right column: every function over the full size range. The pivot is
# identical for both panels, so it is computed once and plotted twice
# (log y on top, log-log at the bottom).
all_sizes = cc.pivot(index='N', columns='fct', values='average')
all_sizes.plot(logy=True, ax=ax[0, 1])
all_sizes.plot(logy=True, logx=True, ax=ax[1, 1])

# Bottom-left: functions whose name contains 'omp' plus ddot_array,
# leaving out the one whose name contains 'dyn'.
selected = ((cc.fct.str.contains('omp') | (cc.fct == 'ddot_array')) &
            ~cc.fct.str.contains('dyn'))
cc[selected].pivot(index='N', columns='fct', values='average').plot(
    logy=True, ax=ax[1, 0])

# Each title states what its panel actually shows. The previous
# "without dot_product" title sat on a panel that plots every function.
ax[0, 0].set_title("Comparison of cython ddot implementations"
                   "\nN <= 1000")
ax[0, 1].set_title("Comparison of cython ddot implementations"
                   "\nall sizes")
ax[1, 0].set_title("OpenMP implementations and ddot_array"
                   "\nwithout ddot_omp_dyn")
ax[1, 1].set_title("All implementations, log-log scale")
plt.show()
Total running time of the script: ( 1 minutes 5.120 seconds)