Compares dot implementations (numpy, C++, SSE, OpenMP)

numpy has a very fast implementation of the dot product. It is difficult to do better and very easy to do worse. This example looks into a couple of slower implementations written with cython. The tested functions are imported and wrapped below:

import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from td3a_cpp.tutorial.dot_cython import (
    ddot_array_16_sse, ddot_array
)
from td3a_cpp.tutorial.dot_cython_omp import (
    ddot_cython_array_omp,
    ddot_array_openmp,
    get_omp_max_threads,
    ddot_array_openmp_16
)
from td3a_cpp.tools import measure_time_dim


def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """
    Builds one benchmark context per vector size.

    :param fct: callable stored under the key ``'dot'`` of every context
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: type the random vectors are cast to
    :return: list of dictionaries with keys ``'va'``, ``'vb'``, ``'dot'``
        and ``'x_name'``, one for each size in ``range(10, n, h)``
    """
    contexts = []
    for size in range(10, n, h):
        # va is drawn before vb so that a seeded generator yields the
        # same pair of vectors for a given size.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append({
            'va': left,
            'vb': right,
            'dot': fct,
            'x_name': size,
        })
    return contexts

Number of threads

print(get_omp_max_threads())
8

Several cython dot implementations

def numpy_dot(va, vb):
    """
    Reference implementation: returns ``numpy.dot(va, vb)``.

    :param va: first operand
    :param vb: second operand
    :return: result of :func:`numpy.dot`
    """
    product = numpy.dot(va, vb)
    return product


def ddot_omp(va, vb):
    """
    Calls ``ddot_cython_array_omp`` on *va* and *vb* without specifying
    a schedule, so the callee's own default applies.

    :param va: first vector
    :param vb: second vector
    :return: value returned by ``ddot_cython_array_omp``
    """
    result = ddot_cython_array_omp(va, vb)
    return result


def ddot_omp_static(va, vb):
    """
    Calls ``ddot_cython_array_omp`` on *va* and *vb* with ``schedule=1``.

    NOTE(review): going by this wrapper's name, ``schedule=1`` presumably
    selects a static OpenMP schedule -- confirm in ``dot_cython_omp``.

    :param va: first vector
    :param vb: second vector
    :return: value returned by ``ddot_cython_array_omp``
    """
    result = ddot_cython_array_omp(va, vb, schedule=1)
    return result


def ddot_omp_dyn(va, vb):
    """
    Calls ``ddot_cython_array_omp`` on *va* and *vb* with ``schedule=2``.

    NOTE(review): going by this wrapper's name, ``schedule=2`` presumably
    selects a dynamic OpenMP schedule -- confirm in ``dot_cython_omp``.

    :param va: first vector
    :param vb: second vector
    :return: value returned by ``ddot_cython_array_omp``
    """
    result = ddot_cython_array_omp(va, vb, schedule=2)
    return result


def ddot_omp_cpp(va, vb):
    """
    Calls ``ddot_array_openmp`` (imported from ``dot_cython_omp``)
    on *va* and *vb*.

    :param va: first vector
    :param vb: second vector
    :return: value returned by ``ddot_array_openmp``
    """
    result = ddot_array_openmp(va, vb)
    return result


def ddot_omp_cpp_16(va, vb):
    """
    Calls ``ddot_array_openmp_16`` (imported from ``dot_cython_omp``)
    on *va* and *vb*.

    :param va: first vector
    :param vb: second vector
    :return: value returned by ``ddot_array_openmp_16``
    """
    result = ddot_array_openmp_16(va, vb)
    return result


# Benchmark every implementation on the same grid of vector sizes and
# keep one DataFrame of measures per function in ``dfs``.
dfs = []
for fct in (
        numpy_dot,
        ddot_array,
        ddot_array_16_sse,
        ddot_omp,
        ddot_omp_static,
        ddot_omp_dyn,
        ddot_omp_cpp,
        ddot_omp_cpp_16):
    # One context per size in range(10, 40000, 250); each context
    # carries the vectors and the function under the name 'dot'.
    ctxs = get_vectors(fct, 40000)

    print(fct.__name__)
    # The statement 'dot(va, vb)' only uses names that are keys of the
    # contexts -- presumably measure_time_dim evaluates it against each
    # context in turn (confirm in td3a_cpp.tools).
    rows = list(measure_time_dim('dot(va, vb)', ctxs, verbose=1))
    df = DataFrame(rows)
    # Tag the measures so they can be told apart after concatenation.
    df['fct'] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot

  0%|          | 0/160 [00:00<?, ?it/s]
  9%|9         | 15/160 [00:00<00:01, 139.88it/s]
 18%|#8        | 29/160 [00:00<00:01, 116.55it/s]
 26%|##5       | 41/160 [00:00<00:01, 98.21it/s]
 32%|###2      | 52/160 [00:00<00:01, 89.24it/s]
 39%|###8      | 62/160 [00:00<00:01, 83.60it/s]
 44%|####4     | 71/160 [00:00<00:01, 79.25it/s]
 49%|####9     | 79/160 [00:00<00:01, 76.13it/s]
 54%|#####4    | 87/160 [00:01<00:00, 73.73it/s]
 59%|#####9    | 95/160 [00:01<00:00, 71.21it/s]
 64%|######4   | 103/160 [00:01<00:00, 65.50it/s]
 69%|######8   | 110/160 [00:01<00:00, 66.20it/s]
 74%|#######3  | 118/160 [00:01<00:00, 64.51it/s]
 78%|#######8  | 125/160 [00:01<00:00, 56.49it/s]
 82%|########1 | 131/160 [00:01<00:00, 54.07it/s]
 86%|########5 | 137/160 [00:01<00:00, 50.37it/s]
 89%|########9 | 143/160 [00:02<00:00, 51.96it/s]
 93%|#########3| 149/160 [00:02<00:00, 51.18it/s]
 97%|#########6| 155/160 [00:02<00:00, 52.41it/s]
100%|##########| 160/160 [00:02<00:00, 66.88it/s]
      average     deviation  min_exec  ...  context_size  x_name        fct
157  0.000034  3.487521e-07  0.000033  ...           232   39260  numpy_dot
158  0.000033  4.206640e-07  0.000033  ...           232   39510  numpy_dot
159  0.000029  4.578539e-06  0.000024  ...           232   39760  numpy_dot

[3 rows x 9 columns]
ddot_array

  0%|          | 0/160 [00:00<?, ?it/s]
 11%|#         | 17/160 [00:00<00:00, 169.14it/s]
 21%|##1       | 34/160 [00:00<00:01, 112.61it/s]
 29%|##9       | 47/160 [00:00<00:01, 87.53it/s]
 36%|###5      | 57/160 [00:00<00:01, 73.42it/s]
 41%|####      | 65/160 [00:00<00:01, 63.94it/s]
 45%|####5     | 72/160 [00:01<00:01, 56.78it/s]
 49%|####8     | 78/160 [00:01<00:01, 51.15it/s]
 52%|#####2    | 84/160 [00:01<00:01, 46.50it/s]
 56%|#####5    | 89/160 [00:01<00:01, 42.98it/s]
 59%|#####8    | 94/160 [00:01<00:01, 39.59it/s]
 61%|######1   | 98/160 [00:01<00:01, 37.41it/s]
 64%|######3   | 102/160 [00:01<00:01, 35.21it/s]
 66%|######6   | 106/160 [00:02<00:01, 33.44it/s]
 69%|######8   | 110/160 [00:02<00:01, 31.68it/s]
 71%|#######1  | 114/160 [00:02<00:01, 30.28it/s]
 74%|#######3  | 118/160 [00:02<00:01, 28.89it/s]
 76%|#######5  | 121/160 [00:02<00:01, 28.03it/s]
 78%|#######7  | 124/160 [00:02<00:01, 27.05it/s]
 79%|#######9  | 127/160 [00:02<00:01, 26.31it/s]
 81%|########1 | 130/160 [00:02<00:01, 25.41it/s]
 83%|########3 | 133/160 [00:03<00:01, 24.47it/s]
 85%|########5 | 136/160 [00:03<00:01, 23.77it/s]
 87%|########6 | 139/160 [00:03<00:00, 23.17it/s]
 89%|########8 | 142/160 [00:03<00:00, 22.38it/s]
 91%|######### | 145/160 [00:03<00:00, 21.81it/s]
 92%|#########2| 148/160 [00:03<00:00, 21.24it/s]
 94%|#########4| 151/160 [00:03<00:00, 20.82it/s]
 96%|#########6| 154/160 [00:04<00:00, 20.28it/s]
 98%|#########8| 157/160 [00:04<00:00, 19.93it/s]
 99%|#########9| 159/160 [00:04<00:00, 19.79it/s]
100%|##########| 160/160 [00:04<00:00, 35.95it/s]
      average     deviation  min_exec  ...  context_size  x_name         fct
157  0.000103  5.276492e-07  0.000102  ...           232   39260  ddot_array
158  0.000098  4.303988e-07  0.000098  ...           232   39510  ddot_array
159  0.000112  5.036949e-07  0.000111  ...           232   39760  ddot_array

[3 rows x 9 columns]
ddot_array_16_sse

  0%|          | 0/160 [00:00<?, ?it/s]
 12%|#1        | 19/160 [00:00<00:00, 188.75it/s]
 24%|##3       | 38/160 [00:00<00:00, 129.93it/s]
 33%|###3      | 53/160 [00:00<00:01, 101.77it/s]
 41%|####      | 65/160 [00:00<00:01, 85.07it/s]
 47%|####6     | 75/160 [00:00<00:01, 73.99it/s]
 52%|#####1    | 83/160 [00:01<00:01, 66.04it/s]
 56%|#####6    | 90/160 [00:01<00:01, 59.86it/s]
 61%|######    | 97/160 [00:01<00:01, 54.32it/s]
 64%|######4   | 103/160 [00:01<00:01, 50.10it/s]
 68%|######8   | 109/160 [00:01<00:01, 46.48it/s]
 71%|#######1  | 114/160 [00:01<00:01, 43.38it/s]
 74%|#######4  | 119/160 [00:01<00:01, 40.82it/s]
 78%|#######7  | 124/160 [00:02<00:00, 38.36it/s]
 80%|########  | 128/160 [00:02<00:00, 36.67it/s]
 82%|########2 | 132/160 [00:02<00:00, 34.85it/s]
 85%|########5 | 136/160 [00:02<00:00, 33.53it/s]
 88%|########7 | 140/160 [00:02<00:00, 32.03it/s]
 90%|######### | 144/160 [00:02<00:00, 30.89it/s]
 92%|#########2| 148/160 [00:02<00:00, 29.86it/s]
 94%|#########4| 151/160 [00:03<00:00, 29.03it/s]
 96%|#########6| 154/160 [00:03<00:00, 28.04it/s]
 98%|#########8| 157/160 [00:03<00:00, 27.53it/s]
100%|##########| 160/160 [00:03<00:00, 26.77it/s]
100%|##########| 160/160 [00:03<00:00, 47.62it/s]
      average     deviation  min_exec  ...  context_size  x_name                fct
157  0.000079  3.905469e-07  0.000078  ...           232   39260  ddot_array_16_sse
158  0.000075  8.012449e-07  0.000074  ...           232   39510  ddot_array_16_sse
159  0.000079  5.563830e-07  0.000078  ...           232   39760  ddot_array_16_sse

[3 rows x 9 columns]
ddot_omp

  0%|          | 0/160 [00:00<?, ?it/s]
  7%|6         | 11/160 [00:00<00:01, 101.43it/s]
 14%|#3        | 22/160 [00:00<00:03, 40.15it/s]
 18%|#7        | 28/160 [00:00<00:04, 32.44it/s]
 22%|##2       | 36/160 [00:00<00:03, 41.19it/s]
 28%|##8       | 45/160 [00:00<00:02, 51.38it/s]
 32%|###2      | 52/160 [00:01<00:01, 55.63it/s]
 37%|###6      | 59/160 [00:01<00:01, 50.88it/s]
 42%|####1     | 67/160 [00:01<00:01, 57.64it/s]
 46%|####6     | 74/160 [00:01<00:01, 48.11it/s]
 50%|#####     | 80/160 [00:01<00:01, 49.41it/s]
 54%|#####4    | 87/160 [00:01<00:01, 50.00it/s]
 58%|#####8    | 93/160 [00:02<00:02, 28.49it/s]
 63%|######3   | 101/160 [00:02<00:01, 36.11it/s]
 67%|######6   | 107/160 [00:02<00:01, 35.55it/s]
 71%|#######   | 113/160 [00:02<00:01, 38.83it/s]
 74%|#######3  | 118/160 [00:02<00:01, 37.52it/s]
 78%|#######8  | 125/160 [00:02<00:00, 43.69it/s]
 82%|########1 | 131/160 [00:03<00:00, 34.67it/s]
 86%|########6 | 138/160 [00:03<00:00, 37.99it/s]
 91%|######### | 145/160 [00:03<00:00, 42.91it/s]
 94%|#########3| 150/160 [00:03<00:00, 37.88it/s]
 97%|#########6| 155/160 [00:03<00:00, 36.70it/s]
100%|##########| 160/160 [00:03<00:00, 41.68it/s]
      average     deviation  min_exec  ...  context_size  x_name       fct
157  0.000030  5.419707e-07  0.000030  ...           232   39260  ddot_omp
158  0.000031  4.931126e-07  0.000030  ...           232   39510  ddot_omp
159  0.000040  3.528745e-05  0.000028  ...           232   39760  ddot_omp

[3 rows x 9 columns]
ddot_omp_static

  0%|          | 0/160 [00:00<?, ?it/s]
  4%|3         | 6/160 [00:00<00:02, 59.37it/s]
  9%|8         | 14/160 [00:00<00:02, 61.79it/s]
 14%|#4        | 23/160 [00:00<00:01, 70.41it/s]
 19%|#9        | 31/160 [00:04<00:26,  4.85it/s]
 24%|##3       | 38/160 [00:04<00:17,  7.00it/s]
 28%|##7       | 44/160 [00:05<00:20,  5.67it/s]
 30%|###       | 48/160 [00:06<00:16,  6.90it/s]
 34%|###4      | 55/160 [00:06<00:10,  9.87it/s]
 38%|###8      | 61/160 [00:06<00:07, 13.15it/s]
 41%|####1     | 66/160 [00:06<00:05, 16.22it/s]
 44%|####4     | 71/160 [00:08<00:15,  5.92it/s]
 47%|####6     | 75/160 [00:09<00:15,  5.32it/s]
 51%|#####     | 81/160 [00:09<00:10,  7.66it/s]
 53%|#####3    | 85/160 [00:09<00:07,  9.46it/s]
 56%|#####6    | 90/160 [00:10<00:05, 12.00it/s]
 59%|#####8    | 94/160 [00:10<00:06, 10.29it/s]
 62%|######2   | 100/160 [00:10<00:04, 14.41it/s]
 66%|######6   | 106/160 [00:10<00:02, 19.01it/s]
 69%|######8   | 110/160 [00:11<00:02, 19.84it/s]
 71%|#######1  | 114/160 [00:11<00:02, 20.19it/s]
 74%|#######3  | 118/160 [00:11<00:01, 21.85it/s]
 76%|#######6  | 122/160 [00:11<00:02, 18.70it/s]
 79%|#######9  | 127/160 [00:11<00:01, 22.34it/s]
 81%|########1 | 130/160 [00:12<00:01, 15.71it/s]
 83%|########3 | 133/160 [00:13<00:03,  8.83it/s]
 86%|########5 | 137/160 [00:13<00:01, 11.57it/s]
 88%|########7 | 140/160 [00:13<00:01, 13.49it/s]
 89%|########9 | 143/160 [00:13<00:01, 15.25it/s]
 91%|#########1| 146/160 [00:13<00:00, 16.60it/s]
 93%|#########3| 149/160 [00:13<00:00, 16.28it/s]
 95%|#########5| 152/160 [00:13<00:00, 18.02it/s]
 98%|#########7| 156/160 [00:13<00:00, 22.16it/s]
 99%|#########9| 159/160 [00:14<00:00, 21.07it/s]
100%|##########| 160/160 [00:14<00:00, 11.32it/s]
      average     deviation  min_exec  ...  context_size  x_name              fct
157  0.000055  9.863351e-07  0.000054  ...           232   39260  ddot_omp_static
158  0.000166  3.068552e-04  0.000047  ...           232   39510  ddot_omp_static
159  0.000050  7.934247e-07  0.000049  ...           232   39760  ddot_omp_static

[3 rows x 9 columns]
ddot_omp_dyn

  0%|          | 0/160 [00:00<?, ?it/s]
  1%|          | 1/160 [00:00<01:11,  2.23it/s]
  6%|5         | 9/160 [00:00<00:07, 20.40it/s]
  9%|8         | 14/160 [00:00<00:05, 26.63it/s]
 12%|#1        | 19/160 [00:00<00:04, 31.25it/s]
 15%|#5        | 24/160 [00:00<00:04, 33.29it/s]
 18%|#8        | 29/160 [00:01<00:03, 32.77it/s]
 21%|##        | 33/160 [00:01<00:03, 31.93it/s]
 23%|##3       | 37/160 [00:01<00:03, 32.80it/s]
 26%|##5       | 41/160 [00:01<00:03, 30.63it/s]
 28%|##8       | 45/160 [00:01<00:03, 31.26it/s]
 31%|###       | 49/160 [00:01<00:03, 29.40it/s]
 33%|###3      | 53/160 [00:01<00:03, 27.93it/s]
 35%|###5      | 56/160 [00:02<00:03, 26.81it/s]
 37%|###6      | 59/160 [00:02<00:06, 14.68it/s]
 39%|###8      | 62/160 [00:02<00:07, 13.07it/s]
 40%|####      | 64/160 [00:03<00:07, 12.36it/s]
 41%|####1     | 66/160 [00:03<00:07, 13.22it/s]
 42%|####2     | 68/160 [00:03<00:06, 13.70it/s]
 44%|####4     | 71/160 [00:03<00:05, 16.45it/s]
 46%|####5     | 73/160 [00:03<00:05, 14.79it/s]
 47%|####6     | 75/160 [00:05<00:20,  4.13it/s]
 49%|####8     | 78/160 [00:05<00:13,  5.86it/s]
 50%|#####     | 80/160 [00:05<00:11,  7.04it/s]
 51%|#####1    | 82/160 [00:06<00:21,  3.59it/s]
 53%|#####3    | 85/160 [00:06<00:14,  5.17it/s]
 54%|#####4    | 87/160 [00:07<00:13,  5.30it/s]
 56%|#####5    | 89/160 [00:07<00:10,  6.47it/s]
 57%|#####6    | 91/160 [00:08<00:18,  3.72it/s]
 58%|#####8    | 93/160 [00:08<00:14,  4.57it/s]
 59%|#####9    | 95/160 [00:08<00:12,  5.22it/s]
 61%|######    | 97/160 [00:09<00:12,  5.11it/s]
 62%|######2   | 100/160 [00:09<00:10,  5.91it/s]
 63%|######3   | 101/160 [00:09<00:11,  5.19it/s]
 64%|######3   | 102/160 [00:10<00:13,  4.36it/s]
 64%|######4   | 103/160 [00:10<00:11,  4.76it/s]
 66%|######5   | 105/160 [00:10<00:08,  6.63it/s]
 67%|######6   | 107/160 [00:10<00:08,  6.14it/s]
 68%|######8   | 109/160 [00:10<00:06,  7.93it/s]
 69%|######9   | 111/160 [00:11<00:05,  9.49it/s]
 71%|#######   | 113/160 [00:11<00:07,  6.66it/s]
 72%|#######1  | 115/160 [00:11<00:06,  7.36it/s]
 73%|#######3  | 117/160 [00:12<00:05,  7.97it/s]
 74%|#######4  | 119/160 [00:12<00:04,  9.31it/s]
 76%|#######5  | 121/160 [00:12<00:04,  8.84it/s]
 77%|#######6  | 123/160 [00:12<00:04,  8.62it/s]
 78%|#######8  | 125/160 [00:12<00:03,  9.94it/s]
 79%|#######9  | 127/160 [00:12<00:02, 11.16it/s]
 81%|########  | 129/160 [00:13<00:02, 11.90it/s]
 82%|########1 | 131/160 [00:13<00:02,  9.91it/s]
 83%|########3 | 133/160 [00:13<00:02, 11.42it/s]
 84%|########4 | 135/160 [00:13<00:01, 12.73it/s]
 86%|########5 | 137/160 [00:13<00:01, 13.80it/s]
 87%|########6 | 139/160 [00:13<00:01, 14.22it/s]
 88%|########8 | 141/160 [00:14<00:01, 11.95it/s]
 89%|########9 | 143/160 [00:14<00:01, 10.13it/s]
 91%|######### | 145/160 [00:14<00:01,  8.60it/s]
 92%|#########1| 147/160 [00:14<00:01,  8.73it/s]
 93%|#########3| 149/160 [00:15<00:01,  8.95it/s]
 94%|#########4| 151/160 [00:15<00:01,  6.66it/s]
 96%|#########5| 153/160 [00:15<00:00,  7.28it/s]
 97%|#########6| 155/160 [00:17<00:01,  2.71it/s]
 98%|#########7| 156/160 [00:21<00:03,  1.08it/s]
 98%|#########8| 157/160 [00:21<00:02,  1.18it/s]
 99%|#########9| 159/160 [00:21<00:00,  1.77it/s]
100%|##########| 160/160 [00:21<00:00,  7.33it/s]
      average  deviation  min_exec  ...  context_size  x_name           fct
157  0.000133   0.000010  0.000127  ...           232   39260  ddot_omp_dyn
158  0.000135   0.000017  0.000128  ...           232   39510  ddot_omp_dyn
159  0.000134   0.000010  0.000128  ...           232   39760  ddot_omp_dyn

[3 rows x 9 columns]
ddot_omp_cpp

  0%|          | 0/160 [00:00<?, ?it/s]
  8%|8         | 13/160 [00:00<00:01, 124.98it/s]
 16%|#6        | 26/160 [00:00<00:01, 120.16it/s]
 24%|##4       | 39/160 [00:00<00:01, 115.34it/s]
 32%|###1      | 51/160 [00:00<00:00, 110.71it/s]
 39%|###9      | 63/160 [00:00<00:00, 106.24it/s]
 46%|####6     | 74/160 [00:00<00:00, 102.42it/s]
 53%|#####3    | 85/160 [00:00<00:00, 98.69it/s]
 59%|#####9    | 95/160 [00:00<00:00, 95.61it/s]
 66%|######5   | 105/160 [00:01<00:00, 92.35it/s]
 72%|#######1  | 115/160 [00:01<00:00, 89.29it/s]
 78%|#######7  | 124/160 [00:01<00:00, 86.61it/s]
 83%|########3 | 133/160 [00:01<00:00, 83.68it/s]
 89%|########8 | 142/160 [00:01<00:00, 81.10it/s]
 94%|#########4| 151/160 [00:01<00:00, 78.54it/s]
 99%|#########9| 159/160 [00:01<00:00, 76.40it/s]
100%|##########| 160/160 [00:01<00:00, 91.08it/s]
      average     deviation  min_exec  ...  context_size  x_name           fct
157  0.000026  3.135781e-07  0.000026  ...           232   39260  ddot_omp_cpp
158  0.000027  2.956589e-07  0.000027  ...           232   39510  ddot_omp_cpp
159  0.000026  5.323258e-07  0.000026  ...           232   39760  ddot_omp_cpp

[3 rows x 9 columns]
ddot_omp_cpp_16

  0%|          | 0/160 [00:00<?, ?it/s]
  8%|8         | 13/160 [00:00<00:01, 127.78it/s]
 16%|#6        | 26/160 [00:00<00:01, 120.54it/s]
 24%|##4       | 39/160 [00:00<00:01, 115.17it/s]
 32%|###1      | 51/160 [00:00<00:00, 110.40it/s]
 39%|###9      | 63/160 [00:00<00:00, 105.89it/s]
 46%|####6     | 74/160 [00:00<00:00, 102.24it/s]
 53%|#####3    | 85/160 [00:00<00:00, 98.63it/s]
 59%|#####9    | 95/160 [00:00<00:00, 95.27it/s]
 66%|######5   | 105/160 [00:01<00:00, 92.26it/s]
 72%|#######1  | 115/160 [00:01<00:00, 89.21it/s]
 78%|#######7  | 124/160 [00:01<00:00, 86.55it/s]
 83%|########3 | 133/160 [00:01<00:00, 84.21it/s]
 89%|########8 | 142/160 [00:01<00:00, 81.83it/s]
 94%|#########4| 151/160 [00:01<00:00, 79.60it/s]
 99%|#########9| 159/160 [00:01<00:00, 77.84it/s]
100%|##########| 160/160 [00:01<00:00, 91.60it/s]
      average     deviation  min_exec  ...  context_size  x_name              fct
157  0.000026  2.998788e-07  0.000025  ...           232   39260  ddot_omp_cpp_16
158  0.000025  2.916635e-07  0.000025  ...           232   39510  ddot_omp_cpp_16
159  0.000025  4.160573e-07  0.000025  ...           232   39760  ddot_omp_cpp_16

[3 rows x 9 columns]

Let’s display the results

# Stack the per-function measures; 'N' duplicates 'x_name', which
# get_vectors fills with the vector size.
cc = concat(dfs)
cc['N'] = cc['x_name']

fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top-left: every function, restricted to sizes up to 1000.
small = cc[cc.N <= 1000]
small.pivot(index='N', columns='fct', values='average').plot(
    logy=True, ax=ax[0, 0])

# Top-right (log y) and bottom-right (log-log): every function, all sizes.
by_size = cc.pivot(index='N', columns='fct', values='average')
by_size.plot(logy=True, ax=ax[0, 1])
by_size.plot(logy=True, logx=True, ax=ax[1, 1])

# Bottom-left: functions whose name contains 'omp', plus 'ddot_array',
# minus those whose name contains 'dyn'.
is_omp = cc.fct.str.contains('omp')
is_plain = cc.fct == 'ddot_array'
is_dyn = cc.fct.str.contains('dyn')
selected = cc[(is_omp | is_plain) & ~is_dyn]
selected.pivot(index='N', columns='fct', values='average').plot(
    logy=True, ax=ax[1, 0])

# NOTE(review): the second title says "without dot_product" although the
# top-right panel plots every benchmarked function, and the two bottom
# panels get no title -- confirm the intended wording.
ax[0, 0].set_title("Comparison of cython ddot implementations")
ax[0, 1].set_title("Comparison of cython ddot implementations"
                   "\nwithout dot_product")

plt.show()
Comparison of cython ddot implementations, Comparison of cython ddot implementations without dot_product

Total running time of the script: ( 1 minutes 5.120 seconds)

Gallery generated by Sphinx-Gallery