

# Compares matrix multiplication implementations with timeit

:epkg:`numpy` has a very fast implementation of
matrix multiplication. There are many ways to be slower.
The following uses :mod:`timeit` to compare implementations.

Compared implementations:

* :func:`multiply_matrix <td3a_cpp.tutorial.td_mul_cython.multiply_matrix>`
  [code](https://github.com/sdpython/td3a_cpp/blob/master/td3a_cpp/tutorial/td_mul_cython.pyx#L14)
* :func:`c_multiply_matrix <td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix>`
  [code](https://github.com/sdpython/td3a_cpp/blob/master/td3a_cpp/tutorial/td_mul_cython.pyx#L69)
* :func:`c_multiply_matrix_parallel
  <td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix_parallel>`
  [code](https://github.com/sdpython/td3a_cpp/blob/master/td3a_cpp/tutorial/td_mul_cython.pyx#L49)
* :func:`c_multiply_matrix_parallel_transposed
  <td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix_parallel_transposed>`
  [code](https://github.com/sdpython/td3a_cpp/blob/master/td3a_cpp/tutorial/td_mul_cython.pyx#L106)

## Preparation


In [None]:
import timeit
import numpy

from td3a_cpp.tutorial.td_mul_cython import (
    multiply_matrix, c_multiply_matrix,
    c_multiply_matrix_parallel,
    c_multiply_matrix_parallel_transposed as cmulparamtr)


# Two random float64 operands: va is 150x100 and vb is 100x100, so the
# product ``va @ vb`` is well defined. va is drawn first, then vb.
va, vb = (
    numpy.random.randn(n_rows, 100).astype(numpy.float64)
    for n_rows in (150, 100)
)

# Namespace handed to timeit through ``globals=``: the timed statements
# refer to the operands and to every implementation by these names.
ctx = dict(
    va=va,
    vb=vb,
    c_multiply_matrix=c_multiply_matrix,
    multiply_matrix=multiply_matrix,
    c_multiply_matrix_parallel=c_multiply_matrix_parallel,
    c_multiply_matrix_parallel_transposed=cmulparamtr,
)

## Measures

numpy



In [None]:
# Baseline: total time in seconds for 100 evaluations of numpy's ``@``
# product, with the statement resolved against the names stored in ctx.
res0 = timeit.Timer('va @ vb', globals=ctx).timeit(number=100)
print("numpy time", res0)

python implementation



In [None]:
# The pure python implementation is only run 10 times, whereas every other
# measure in this comparison uses number=100. timeit returns the total time
# for all runs, so the raw value is rescaled to the same 100-run basis;
# otherwise any ratio against the other measures is off by a factor of 10.
n_python_runs, n_reference_runs = 10, 100
res1 = timeit.timeit(
    'multiply_matrix(va, vb)', number=n_python_runs, globals=ctx
) * n_reference_runs / n_python_runs
print('python implementation', res1)

cython implementation



In [None]:
# Total time in seconds for 100 calls of c_multiply_matrix on (va, vb),
# looked up through the ctx namespace.
res2 = timeit.Timer('c_multiply_matrix(va, vb)', globals=ctx).timeit(
    number=100)
print('cython implementation', res2)

cython implementation parallelized



In [None]:
# Total time in seconds for 100 calls of c_multiply_matrix_parallel on
# (va, vb), looked up through the ctx namespace.
res3 = timeit.Timer(
    'c_multiply_matrix_parallel(va, vb)', globals=ctx,
).timeit(number=100)
print('cython implementation parallelized', res3)

cython implementation parallelized, AVX + transposed



In [None]:
# Total time in seconds for 100 calls of the function registered in ctx
# under the name 'c_multiply_matrix_parallel_transposed' on (va, vb).
res4 = timeit.Timer(
    'c_multiply_matrix_parallel_transposed(va, vb)', globals=ctx,
).timeit(number=100)
print('cython implementation parallelized avx', res4)

Speed up...



In [None]:
# Report each measured time as a ratio over the numpy baseline res0,
# one line per implementation, in the order they were measured.
for label, elapsed in (
    ("pure python", res1),
    ("cython", res2),
    ("parallelized cython", res3),
    ("avx parallelized cython", res4),
):
    print(f"numpy is {elapsed / res0:f} faster than {label}.")