Code optimization with cffi, numba, cython


The idea is to rewrite a function in C. As an example we take the prediction function of scikit-learn's linear regression, which computes nothing more than a dot product plus an intercept, ŷ = ⟨x, coef⟩ + intercept, and we measure the speedup obtained by rewriting it in a faster language.

from jyquickhelper import add_notebook_menu
add_notebook_menu()
memo_time = []
import timeit

def unit(x):
    if x >= 1: return "%1.2f s" % x
    elif x >= 1e-3: return "%1.2f ms" % (x* 1000)
    elif x >= 1e-6: return "%1.2f µs" % (x* 1000**2)
    elif x >= 1e-9: return "%1.2f ns" % (x* 1000**3)
    else:
        return "%1.2g s" % x

def timeexe(legend, code, number=100, repeat=1000):
    # Measures ``code``: *repeat* loops of *number* executions each.
    # All statistics are reported per single execution.
    rep = timeit.repeat(code, number=number, repeat=repeat, globals=globals())
    ave = sum(rep) / (number * repeat)
    std = (sum((x / number - ave) ** 2 for x in rep) / repeat) ** 0.5
    fir = rep[0] / number
    fir3 = sum(rep[:3]) / (3 * number)
    las3 = sum(rep[-3:]) / (3 * number)
    rep.sort()
    mini = rep[len(rep) // 20] / number    # 5th percentile
    maxi = rep[-len(rep) // 20] / number   # 95th percentile
    print("Moyenne: %s Ecart-type %s (with %d runs) in [%s, %s]" % (
                unit(ave), unit(std), number, unit(mini), unit(maxi)))
    return dict(legend=legend, average=ave, deviation=std, first=fir, first3=fir3,
                last3=las3, repeat=repeat, min5=mini, max5=maxi, code=code, run=number)

Linear regression

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test  = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test  = diabetes.target[-20:]
from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(diabetes_X_train, diabetes_y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
clr.coef_
array([ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
       -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
        7.43519617e+02,  7.60951722e+01])
clr.intercept_
152.76430691633442
z = diabetes_X_test[0:1,:]
memo_time.append(timeexe("sklearn.predict", "clr.predict(z)"))
Moyenne: 60.98 µs Ecart-type 15.76 µs (with 100 runs) in [44.42 µs, 92.97 µs]
%timeit clr.predict(z)
58.3 µs ± 3.72 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

Optimization with cffi

We start from the example Purely for performance (API level, out-of-line).

from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg(int, double *, double *, double, double *);")

ffibuilder.set_source("_linear_regression",
r"""
    static int linreg(int dimension, double * x, double *coef, double intercept, double * out)
    {
        for(; dimension > 0; --dimension, ++x, ++coef)
            intercept += *x * *coef;
        *out = intercept;
        return 1;
    }
""")

ffibuilder.compile(verbose=True)
generating ._linear_regression.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression' extension
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression.cp37-win_amd64.pyd'

The compiled function is accessible as follows.

from _linear_regression import ffi, lib
lib.linreg
<function _linear_regression.CompiledLib.linreg>

We follow the example How to pass a Numpy array into a cffi function and how to get one back out?.

import numpy
out = numpy.zeros(1)
ptr_coef = clr.coef_.__array_interface__['data'][0]
cptr_coef = ffi.cast("double*", ptr_coef)
x = diabetes_X_test[0:1,:]
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast("double*", ptr_x)
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast("double*", ptr_out)
n = len(clr.coef_)
lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)
1
out
array([197.61846908])

We check that we get the same result.

clr.predict(x)
array([197.61846908])
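
As an aside, a recent cffi can build the same pointers without reading __array_interface__ by hand (an assumption: the two-argument form of from_buffer, which appeared around cffi 1.12); a minimal sketch reusing the arrays above:

cptr_x2 = ffi.from_buffer("double[]", x)      # shares x's memory, no copy
cptr_out2 = ffi.from_buffer("double[]", out)
lib.linreg(n, cptr_x2, cptr_coef, clr.intercept_, cptr_out2)
out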

And we measure the execution time:

memo_time.append(timeexe("cffi-linreg", "lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)"))
Moyenne: 794.61 ns Ecart-type 417.41 ns (with 100 runs) in [462.23 ns, 1.37 µs]

It is much faster. To be completely fair, we must also measure the steps that extract the pointers.

def predict_clr(x, clr):
    out = numpy.zeros(1)
    ptr_coef = clr.coef_.__array_interface__['data'][0]
    cptr_coef = ffi.cast("double*", ptr_coef)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast("double*", ptr_x)
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast("double*", ptr_out)
    # x has shape (1, n): the feature count is x.shape[1], not len(x)
    lib.linreg(x.shape[1], cptr_x, cptr_coef, clr.intercept_, cptr_out)
    return out

predict_clr(x, clr)
array([197.61846908])
memo_time.append(timeexe("cffi-linreg-wrapped", "predict_clr(x, clr)"))
Moyenne: 16.11 µs Ecart-type 6.78 µs (with 100 runs) in [8.44 µs, 27.62 µs]

It is still faster.

cffi - second version

Since the function is built dynamically (the code is known at execution time), we can easily get rid of the loop and write the code without it, with the coefficients inlined.

res = " + ".join("{0}*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'0.3034995490660432*x[0] + -237.63931533353403*x[1] + 510.5306054362253*x[2] + 327.7369804093466*x[3] + -814.1317093725387*x[4] + 492.81458798373217*x[5] + 102.8484521916802*x[6] + 184.60648905984*x[7] + 743.519616750542*x[8] + 76.09517221662392*x[9]'
code = """
    static int linreg_custom(double * x, double * out)
    {{
        out[0] = {0} + {1};
    }}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom(double * x, double * out)
{
    out[0] = 152.76430691633442 + 0.3034995490660432*x[0] + -237.63931533353403*x[1] + 510.5306054362253*x[2] + 327.7369804093466*x[3] + -814.1317093725387*x[4] + 492.81458798373217*x[5] + 102.8484521916802*x[6] + 184.60648905984*x[7] + 743.519616750542*x[8] + 76.09517221662392*x[9];
}
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom(double *, double *);")
ffibuilder.set_source("_linear_regression_custom", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom' extension
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom.cp37-win_amd64.pyd'
from _linear_regression_custom.lib import linreg_custom
linreg_custom(cptr_x, cptr_out)
out
array([197.61846908])
memo_time.append(timeexe("cffi-linreg-custom", "linreg_custom(cptr_x, cptr_out)"))
Moyenne: 694.52 ns Ecart-type 592.96 ns (with 100 runs) in [395.06 ns, 1.19 µs]

It is again a little faster: the generated code saves the loop and the dimension argument.

def predict_clr_custom(x):
    out = numpy.zeros(1)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast("double*", ptr_x)
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast("double*", ptr_out)
    linreg_custom(cptr_x, cptr_out)
    return out

predict_clr_custom(x)
array([197.61846908])
memo_time.append(timeexe("cffi-linreg-custom wrapped", "predict_clr_custom(x)"))
Moyenne: 9.71 µs Ecart-type 2.86 µs (with 100 runs) in [5.88 µs, 16.23 µs]

It is a bit faster.

And with floats?

The processor distinguishes between doubles, coded on 64 bits, and floats, coded on 32 bits. Precision is better in the first case and computation is faster in the second. In machine learning, we usually prefer speed to a loss of precision which is often compensated by the optimization inherent to any machine learning problem: what is lost on one observation is recovered on another.
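
Before rewriting the function, a quick numpy aside illustrating the trade-off (a sketch, not part of the benchmark): float32 halves the memory and keeps roughly 7 significant digits.

v64 = numpy.random.rand(1000)
v32 = v64.astype(numpy.float32)
print(v64.nbytes, v32.nbytes)                        # 8000 vs 4000 bytes
print(abs(v32.astype(numpy.float64) - v64).max())    # rounding error around 1e-8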

res = " + ".join("{0}f*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'0.3034995490660432f*x[0] + -237.63931533353403f*x[1] + 510.5306054362253f*x[2] + 327.7369804093466f*x[3] + -814.1317093725387f*x[4] + 492.81458798373217f*x[5] + 102.8484521916802f*x[6] + 184.60648905984f*x[7] + 743.519616750542f*x[8] + 76.09517221662392f*x[9]'
code = """
    static int linreg_custom_float(float * x, float * out)
    {{
        out[0] = {0}f + {1};
    }}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom_float(float * x, float * out)
{
    out[0] = 152.76430691633442f + 0.3034995490660432f*x[0] + -237.63931533353403f*x[1] + 510.5306054362253f*x[2] + 327.7369804093466f*x[3] + -814.1317093725387f*x[4] + 492.81458798373217f*x[5] + 102.8484521916802f*x[6] + 184.60648905984f*x[7] + 743.519616750542f*x[8] + 76.09517221662392f*x[9];
}
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom_float(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom_float.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom_float' extension
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float.cp37-win_amd64.pyd'
from _linear_regression_custom_float.lib import linreg_custom_float
def predict_clr_custom_float(x):
    out = numpy.zeros(1, dtype=numpy.float32)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast("float*", ptr_x)
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast("float*", ptr_out)
    linreg_custom_float(cptr_x, cptr_out)
    return out

Before calling the function, the initial vector must be converted to float32.

x32 = x.astype(numpy.float32)
predict_clr_custom_float(x32)
array([197.61847], dtype=float32)
memo_time.append(timeexe("cffi-linreg-custom-float wrapped", "predict_clr_custom_float(x32)"))
Moyenne: 12.26 µs Ecart-type 5.06 µs (with 100 runs) in [6.64 µs, 22.58 µs]

The difference is not striking. Let's measure the C code alone, even though the Python part cannot be completely avoided.

out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast("float*", ptr_x)
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast("float*", ptr_out)

memo_time.append(timeexe("cffi-linreg-custom-float32", "linreg_custom_float(cptr_x, cptr_out)"))
Moyenne: 1.12 µs Ecart-type 1.17 µs (with 100 runs) in [691.36 ns, 2.34 µs]

The difference is not significant.

SIMD

SIMD is a set of processor instructions that perform element-wise operations on 4 float32 values as fast as on a single one. The processor can only compute on numbers once they have been copied into its registers, so a program spends its time copying numbers from memory to the processor registers, then copying the result back the other way. SIMD instructions save time at the computation level: instead of 4 element-wise multiplications, a single instruction is enough. One only needs to know how to use these instructions. With Visual Studio, they are accessible through the functions listed in Memory and Initialization Using Streaming SIMD Extensions. The following code is probably not optimal but it is not too hard to follow.

code = """
#include <xmmintrin.h>

static int linreg_custom_float_simd(float * x, float * out)
{
    __m128 c1 = _mm_set_ps(0.3034995490664121f, -237.63931533353392f, 510.5306054362245f, 327.7369804093466f);
    __m128 c2 = _mm_set_ps(-814.1317093725389f, 492.81458798373245f, 102.84845219168025f, 184.60648905984064f);
    __m128 r1 = _mm_set_ss(152.76430691633442f);
    r1 = _mm_add_ss(r1, _mm_mul_ps(c1, _mm_load_ps(x)));
    r1 = _mm_add_ss(r1, _mm_mul_ps(c2, _mm_load_ps(x+4)));
    float r[4];
    _mm_store_ps(r, r1);
    out[0] = r[0] + r[1] + r[2] + r[3] + 743.5196167505419f * x[8] + 76.095172216624f * x[9];
    return 1;
}
"""
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom_float_simd(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float_simd", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom_float_simd.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom_float_simd' extension
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float_simd.cp37-win_amd64.pyd'
from _linear_regression_custom_float_simd.lib import linreg_custom_float_simd
out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast("float*", ptr_x)
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast("float*", ptr_out)

linreg_custom_float_simd(cptr_x, cptr_out)
out
array([197.61847], dtype=float32)
memo_time.append(timeexe("cffi-linreg-custom-float32-simd", "linreg_custom_float_simd(cptr_x, cptr_out)"))
Moyenne: 550.92 ns Ecart-type 380.50 ns (with 100 runs) in [355.55 ns, 813.83 ns]

Slightly better. A few references:

Processors evolve over time: 4 floats, 8 floats, SIMD2, FMA4 Intrinsics Added for Visual Studio 2010 SP1, AVX.
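
As a hedged sketch only (written for illustration, neither compiled nor timed here), the same idea with AVX intrinsics from <immintrin.h> handles 8 float32 per instruction, which covers 8 of our 10 coefficients in one multiplication; it would be built with the same ffibuilder pattern as above.

code_avx = """
#include <immintrin.h>

static int linreg_custom_float_avx(float * x, float * out)
{
    /* one vector of 8 coefficients, stored in memory order */
    __m256 c = _mm256_setr_ps(0.3034995f, -237.63931f, 510.53060f, 327.73698f,
                              -814.13170f, 492.81458f, 102.84845f, 184.60648f);
    __m256 r = _mm256_mul_ps(c, _mm256_loadu_ps(x));
    float t[8];
    _mm256_storeu_ps(t, r);
    out[0] = 152.76430f + t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]
             + 743.51961f * x[8] + 76.09517f * x[9];
    return 1;
}
"""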

Pure Python rewrite

We continue with Python only, without numpy.

coef = clr.coef_
list(coef)
[0.3034995490660432,
 -237.63931533353403,
 510.5306054362253,
 327.7369804093466,
 -814.1317093725387,
 492.81458798373217,
 102.8484521916802,
 184.60648905984,
 743.519616750542,
 76.09517221662392]
code = str(clr.intercept_) + "+" + "+".join("x[{0}]*({1})".format(i, c) for i, c in enumerate(coef))
code
'152.76430691633442+x[0]*(0.3034995490660432)+x[1]*(-237.63931533353403)+x[2]*(510.5306054362253)+x[3]*(327.7369804093466)+x[4]*(-814.1317093725387)+x[5]*(492.81458798373217)+x[6]*(102.8484521916802)+x[7]*(184.60648905984)+x[8]*(743.519616750542)+x[9]*(76.09517221662392)'
def predict_clr_python(x):
    return 152.764306916+x[0]*0.3034995490664121+x[1]*(-237.63931533353392)+x[2]*510.5306054362245+ \
            x[3]*327.7369804093466+ \
            x[4]*(-814.1317093725389)+x[5]*492.81458798373245+x[6]*102.84845219168025+ \
            x[7]*184.60648905984064+x[8]*743.5196167505419+x[9]*76.095172216624

predict_clr_python(x[0])
197.61846907469848
z = list(x[0])
memo_time.append(timeexe("python-linreg-custom", "predict_clr_python(z)"))
Moyenne: 5.20 µs Ecart-type 2.36 µs (with 100 runs) in [2.39 µs, 9.22 µs]

Quite surprisingly, this is rather fast. What if we put a loop in it?

def predict_clr_python_loop(x, coef, intercept):
    return intercept + sum(a*b for a, b in zip(x, coef))

predict_clr_python_loop(x[0], list(clr.coef_), clr.intercept_)
197.61846907503298
coef = list(clr.coef_)
intercept = clr.intercept_
memo_time.append(timeexe("python-linreg", "predict_clr_python_loop(z, coef, intercept)"))
Moyenne: 8.13 µs Ecart-type 3.23 µs (with 100 runs) in [4.15 µs, 13.61 µs]

Barely slower.

Rewrite with Python and numpy

def predict_clr_numpy(x, coef, intercept):
    return intercept + numpy.dot(coef, x).sum()

predict_clr_numpy(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numpy-linreg-numpy", "predict_clr_numpy(z, coef, clr.intercept_)"))
Moyenne: 15.41 µs Ecart-type 4.78 µs (with 100 runs) in [8.38 µs, 24.64 µs]

The array dimensions are too small for matrix computation to make a difference. We are back to the cffi case where the Python - C exchanges eat up the whole computation time.
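
To check this claim (a side experiment, not included in the summary table), the dot product wins as soon as the dimension grows, because the fixed cost of the crossing is amortized:

big_x = numpy.random.rand(100000)
big_coef = numpy.random.rand(100000)
%timeit numpy.dot(big_coef, big_x)                   # one crossing, vectorized
%timeit sum(a*b for a, b in zip(big_x, big_coef))    # pure Python loop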

numba

numba tries to compile Python code snippets on the fly. The function to optimize is marked by prefixing it with @jit. Not every construct works; typically, some list comprehensions raise an exception. One therefore has to write Python code in a way fairly close to what it would be in C.

from numba import jit
@jit
def predict_clr_numba(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba(z, clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-notype", "predict_clr_numba(z, clr.coef_, clr.intercept_)"))
Moyenne: 47.39 µs Ecart-type 12.68 µs (with 100 runs) in [33.73 µs, 73.24 µs]

Faster than scikit-learn, but without type information numba stays far behind the other compiled versions.

@jit('double(double[:], double[:], double)')
def predict_clr_numba_cast(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

# The function only works with a numpy.array because C is strongly typed.
predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-type", "predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 1.73 µs Ecart-type 1.25 µs (with 100 runs) in [841.48 ns, 3.49 µs]

The more information the compiler gets, the better it optimizes.

@jit('float32(float32[:], float32[:], float32)')
def predict_clr_numba_cast_float(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

# The function only works with a numpy.array because C is strongly typed.
x32 = x[0].astype(numpy.float32)
c32 = clr.coef_.astype(numpy.float32)
i32 = numpy.float32(clr.intercept_)
predict_clr_numba_cast_float(x32, c32, i32)
197.61846923828125
memo_time.append(timeexe("numba-linreg-type-float32", "predict_clr_numba_cast_float(x32, c32, i32)"))
Moyenne: 1.19 µs Ecart-type 620.34 ns (with 100 runs) in [628.15 ns, 2.18 µs]

Let's try with the coefficients hard-coded in the function.

@jit('double(double[:])')
def predict_clr_numba_cast_custom(x):
    coef = [ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
            -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
             7.43519617e+02,  7.60951722e+01]
    s = 152.76430691633442
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba_cast_custom(x[0])
197.61846907190048
memo_time.append(timeexe("numba-linreg-type-custom", "predict_clr_numba_cast_custom(x[0])"))
Moyenne: 1.59 µs Ecart-type 706.24 ns (with 100 runs) in [813.83 ns, 2.93 µs]

We get close to the timings obtained with cffi without wrapping, which means that numba does a much better job at this level than our hastily written wrapper.

@jit('double(double[:], double[:], double)')
def predict_clr_numba_numpy(x, coef, intercept):
    return intercept + numpy.dot(coef, x).sum()

predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-type-numpy", "predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 12.51 µs Ecart-type 5.14 µs (with 100 runs) in [6.95 µs, 20.01 µs]

numba is less efficient when numpy is involved, because numpy's code is not rewritten, it is called.
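
A variant worth knowing (a sketch with the same loop as above): numba's nopython mode, exposed as njit, refuses the slow object-mode fallback and fails explicitly when a construct cannot be compiled.

from numba import njit

@njit
def predict_clr_numba_njit(x, coef, intercept):
    # same loop as above, compiled in nopython mode
    s = intercept
    for i in range(len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba_njit(x[0], clr.coef_, clr.intercept_)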

cython

cython can build C extensions of a much larger scope than numba. It is the option chosen by scikit-learn. It is better to know C to use it and, here again, the goal is to reduce the Python / C exchanges, which are expensive.

%load_ext cython
%%cython
def predict_clr_cython(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s
predict_clr_cython(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("cython-linreg", "predict_clr_cython(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 5.29 µs Ecart-type 2.55 µs (with 100 runs) in [2.62 µs, 8.99 µs]

In our case Cython does less well than numba, and this optimization remains fairly close to the timing already obtained with pure Python. That is because the machinery behind most Python objects, such as lists or dictionaries, is already written in C.

%%cython
cimport numpy as npc

def predict_clr_cython_type(npc.ndarray[double, ndim=1, mode='c'] x,
                            npc.ndarray[double, ndim=1, mode='c'] coef,
                            double intercept):
    cdef double s = intercept
    for i in range(0, x.shape[0]):
        s += coef[i] * x[i]
    return s
predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("cython-linreg-type", "predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 2.68 µs Ecart-type 892.33 ns (with 100 runs) in [1.57 µs, 3.96 µs]

The time is almost identical, with a significantly smaller standard deviation.
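
One more knob (a hedged variant, not timed here): compiler directives such as boundscheck and wraparound remove the index checks that the typed version still performs, assuming the indices are known to be valid.

%%cython
# cython: boundscheck=False, wraparound=False
cimport numpy as npc

def predict_clr_cython_nocheck(npc.ndarray[double, ndim=1, mode='c'] x,
                               npc.ndarray[double, ndim=1, mode='c'] coef,
                               double intercept):
    cdef double s = intercept
    cdef int i
    for i in range(x.shape[0]):
        s += coef[i] * x[i]
    return s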

One last option: ONNX

ONNX is a serialization format that describes a machine learning or deep learning model. It makes it possible to dissociate the model from the library that produced it (see ML.net and ONNX).

try:
    from onnxmltools import convert_sklearn
    from onnxmltools.utils import save_model
    from onnxmltools.convert.common.data_types import FloatTensorType
    import onnxruntime
    import onnx
    ok_onnx = True
    print("onnx, onnxmltools, onnxruntime sont disponibles.")
except ImportError as e:
    print("La suite requiert onnx, onnxmltools et onnxruntime.")
    print(e)
    ok_onnx = False
onnx, onnxmltools, onnxruntime sont disponibles.

We convert the model to the ONNX format.

if ok_onnx:
    onnx_model = convert_sklearn(clr, 'model', [('input', FloatTensorType([1, clr.coef_.shape[0]]))])
    save_model(onnx_model, 'model.onnx')

    model_onnx = onnx.load('model.onnx')
    print("Modèle sérialisé au format ONNX")
    print(model_onnx)
else:
    print("onnx, onnxmltools, onnxruntime sont disponibles.")
Modèle sérialisé au format ONNX
ir_version: 3
producer_name: "OnnxMLTools"
producer_version: "1.3.0.1000"
domain: "onnxml"
model_version: 0
doc_string: ""
graph {
  node {
    input: "input"
    output: "variable"
    name: "LinearRegressor"
    op_type: "LinearRegressor"
    attribute {
      name: "coefficients"
      floats: 0.3034995496273041
      floats: -237.63931274414062
      floats: 510.5306091308594
      floats: 327.7369689941406
      floats: -814.1317138671875
      floats: 492.8145751953125
      floats: 102.84844970703125
      floats: 184.6064910888672
      floats: 743.5195922851562
      floats: 76.09516906738281
      type: FLOATS
    }
    attribute {
      name: "intercepts"
      floats: 152.76431274414062
      type: FLOATS
    }
    domain: "ai.onnx.ml"
  }
  name: "model"
  input {
    name: "input"
    type {
      tensor_type {
        elem_type: FLOAT
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 10
          }
        }
      }
    }
  }
  output {
    name: "variable"
    type {
      tensor_type {
        elem_type: FLOAT
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
}
opset_import {
  domain: "ai.onnx.ml"
  version: 1
}

We compute the predictions. The onnxruntime module (https://docs.microsoft.com/en-us/python/api/overview/azure/onnx/intro?view=azure-onnx-py) optimizes computation for deep learning models, which explains why every computation is done with reals stored on 4 bytes, numpy.float32.

if ok_onnx:
    sess = onnxruntime.InferenceSession("model.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt(x):
        return sess.run(["variable"], {'input': x})

    print("Prediction:", predict_onnxrt(x.astype(numpy.float32)))
Input: NodeArg(name='input', type='tensor(float)', shape=[1, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[1, 1])
Prediction: [array([[197.61847]], dtype=float32)]
if ok_onnx:
    x32 = x.astype(numpy.float32)
    memo_time.append(timeexe("onnxruntime-float32", "predict_onnxrt(x32)"))
    memo_time.append(timeexe("onnxruntime-float64", "predict_onnxrt(x.astype(numpy.float32))"))
Moyenne: 26.25 µs Ecart-type 8.79 µs (with 100 runs) in [16.11 µs, 42.13 µs]
Moyenne: 32.46 µs Ecart-type 7.86 µs (with 100 runs) in [23.06 µs, 48.51 µs]

NimbusML

NimbusML is a Python wrapper of the ML.net library, which offers various learning algorithms and whose implementation relies for a large part on C# and C++.

try:
    from nimbusml.linear_model import OrdinaryLeastSquaresRegressor
    ok_nimbus = True
except ImportError:
    ok_nimbus = False
if ok_nimbus:
    nlr = OrdinaryLeastSquaresRegressor()
    nlr.fit(diabetes_X_train, diabetes_y_train)
Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
Trainer solving for 11 parameters across 422 examples
Coefficient of determination R2 = 0.512226221150138, or 0.500358245995641 (adjusted)
Not training a calibrator because it is not needed.
Elapsed time: 00:00:01.9415383

The coefficients are accessible through the summary method.

if ok_nimbus:
    df = nlr.summary()
    print(df)
         Bias  Weights.F0  Weights.F1  Weights.F2  Weights.F3  Weights.F4
0  152.764313    0.033607  -12.043589   87.073662   43.275772 -125.306023
   Weights.F5  Weights.F6  Weights.F7  Weights.F8  Weights.F9
0   97.965599   18.633986   34.195492   99.333458   10.483422
if ok_nimbus:
    x = diabetes_X_test[0:1]
    print(nlr.predict(x))
0    197.618484
Name: Score, dtype: float32
if ok_nimbus:
    memo_time.append(timeexe("nimbusml.predict", "nlr.predict(x)", repeat=5, number=5))
Moyenne: 461.28 ms Ecart-type 59.72 ms (with 5 runs) in [404.02 ms, 557.38 ms]

The nimbusml module is optimized for training and for batch predictions.
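
To illustrate (a sketch reusing the model trained above): a single call on the whole test set pays the fixed overhead only once.

if ok_nimbus:
    preds = nlr.predict(diabetes_X_test)   # one call, twenty predictions
    print(preds.head())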

csharpyml

csharpyml is a wrapper around ML.net based on pythonnet.

from pandas import DataFrame
conc = DataFrame(diabetes_X_train.astype(numpy.float32))
features = conc.columns = ["F%d" % i for i in range(0, conc.shape[1])]
conc["Label"] = diabetes_y_train.astype(numpy.float32)
conc.head(n=2)
         F0        F1        F2        F3        F4        F5        F6        F7        F8        F9  Label
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019908 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068330 -0.092204 75.0
concatation = f"concat{{col=Feature:{','.join(features)}}}"
concatation
'concat{col=Feature:F0,F1,F2,F3,F4,F5,F6,F7,F8,F9}'
try:
    from csharpyml.binaries import CSPipeline
    ok_csharpyml = True
except ImportError:
    ok_csharpyml = False
if ok_csharpyml:
    pipe = CSPipeline([concatation], "ols")
    pipe.fit(conc, feature="Feature", label="Label")
if ok_csharpyml:
    conc_test = DataFrame(diabetes_X_test.astype(numpy.float32))
    conc_test.columns = ["F%d" % i for i in range(0, conc_test.shape[1])]
    # The label is required because the internal pipeline assumes it is present...
    conc_test["Label"] = numpy.zeros((conc_test.shape[0],), dtype=numpy.float32)
    print(pipe.predict(conc_test).head(n=2))
         F0       F1        F2        F3        F4        F5        F6
0 -0.078165  0.05068  0.077863  0.052858  0.078236  0.064447  0.026550
1  0.009016  0.05068 -0.039618  0.028758  0.038334  0.073529 -0.072854
         F7        F8        F9     ...      Feature.1  Feature.2  Feature.3
0 -0.002592  0.040672 -0.009362     ...        0.05068   0.077863   0.052858
1  0.108111  0.015567 -0.046641     ...        0.05068  -0.039618   0.028758
   Feature.4  Feature.5  Feature.6  Feature.7  Feature.8  Feature.9
0   0.078236   0.064447   0.026550  -0.002592   0.040672  -0.009362
1   0.038334   0.073529  -0.072854   0.108111   0.015567  -0.046641
        Score
0  197.618500
1  155.439804
[2 rows x 22 columns]
if ok_csharpyml:
    obs = conc_test[:1]
    memo_time.append(timeexe("csharpyml.predict", "pipe.predict(obs)", repeat=30, number=10))
Moyenne: 22.52 ms Ecart-type 5.92 ms (with 10 runs) in [16.32 ms, 38.24 ms]

To work, the module converts the dataframe into a C# dataframe. We remove this cost to get closer to the computation time of the C# function itself.

if ok_csharpyml:
    from csharpyml.binaries import CSDataFrame
    obs = CSDataFrame.read_df(conc_test[:1])
    memo_time.append(timeexe("csharpyml.predict-C#", "pipe.predict(obs)"))
Moyenne: 20.45 µs Ecart-type 7.31 µs (with 100 runs) in [11.36 µs, 34.37 µs]

Summary

import pandas
df = pandas.DataFrame(data=memo_time)
df = df.set_index("legend").sort_values("average")
df
average code deviation first first3 last3 max5 min5 repeat run
legend
cffi-linreg-custom-float32-simd 5.509177e-07 linreg_custom_float_simd(cptr_x, cptr_out) 3.804967e-07 0.000001 8.678167e-07 3.608233e-07 8.138300e-07 3.555500e-07 1000 100
cffi-linreg-custom 6.945190e-07 linreg_custom(cptr_x, cptr_out) 5.929634e-07 0.000003 2.864197e-06 6.834600e-07 1.193090e-06 3.950600e-07 1000 100
cffi-linreg 7.946076e-07 lib.linreg(n, cptr_x, cptr_coef, clr.intercept... 4.174084e-07 0.000003 2.216297e-06 8.928400e-07 1.374820e-06 4.622300e-07 1000 100
cffi-linreg-custom-float32 1.122592e-06 linreg_custom_float(cptr_x, cptr_out) 1.166959e-06 0.000024 9.311607e-06 6.966267e-07 2.342720e-06 6.913600e-07 1000 100
numba-linreg-type-float32 1.190092e-06 predict_clr_numba_cast_float(x32, c32, i32) 6.203381e-07 0.000001 1.164117e-06 1.984530e-06 2.176790e-06 6.281500e-07 1000 100
numba-linreg-type-custom 1.587967e-06 predict_clr_numba_cast_custom(x[0]) 7.062380e-07 0.000001 9.204900e-07 1.351110e-06 2.927400e-06 8.138300e-07 1000 100
numba-linreg-type 1.730193e-06 predict_clr_numba_cast(x[0], clr.coef_, clr.in... 1.246255e-06 0.000005 4.196877e-06 1.569717e-06 3.492350e-06 8.414800e-07 1000 100
cython-linreg-type 2.684370e-06 predict_clr_cython_type(x[0], clr.coef_, clr.i... 8.923258e-07 0.000002 2.409877e-06 2.812843e-06 3.958520e-06 1.572350e-06 1000 100
python-linreg-custom 5.198307e-06 predict_clr_python(z) 2.358929e-06 0.000006 6.716053e-06 2.392760e-06 9.220750e-06 2.386170e-06 1000 100
cython-linreg 5.289535e-06 predict_clr_cython(x[0], clr.coef_, clr.interc... 2.545080e-06 0.000005 4.107327e-06 4.374653e-06 8.987660e-06 2.615300e-06 1000 100
python-linreg 8.126214e-06 predict_clr_python_loop(z, coef, intercept) 3.229416e-06 0.000006 5.899590e-06 6.951773e-06 1.360988e-05 4.152100e-06 1000 100
cffi-linreg-custom wrapped 9.714181e-06 predict_clr_custom(x) 2.861559e-06 0.000025 2.240527e-05 9.818607e-06 1.622519e-05 5.878520e-06 1000 100
cffi-linreg-custom-float wrapped 1.226456e-05 predict_clr_custom(x32) 5.064189e-06 0.000015 1.491754e-05 1.953318e-05 2.258174e-05 6.637040e-06 1000 100
numba-linreg-type-numpy 1.251224e-05 predict_clr_numba_numpy(x[0], clr.coef_, clr.i... 5.142496e-06 0.000062 4.541236e-05 2.328758e-05 2.000988e-05 6.945180e-06 1000 100
numpy-linreg-numpy 1.541223e-05 predict_clr_numpy(z, coef, clr.intercept_) 4.782486e-06 0.000018 1.726157e-05 1.396148e-05 2.464001e-05 8.379260e-06 1000 100
cffi-linreg-wrapped 1.610619e-05 predict_clr(x, clr) 6.783167e-06 0.000020 2.203919e-05 1.371128e-05 2.762273e-05 8.438520e-06 1000 100
csharpyml.predict-C# 2.045361e-05 pipe.predict(obs) 7.309846e-06 0.000047 3.968397e-05 1.500313e-05 3.436644e-05 1.136198e-05 1000 100
onnxruntime-float32 2.625491e-05 predict_onnxrt(x32) 8.793059e-06 0.000018 2.664166e-05 2.437664e-05 4.212545e-05 1.610668e-05 1000 100
onnxruntime-float64 3.246068e-05 predict_onnxrt(x.astype(numpy.float32)) 7.862068e-06 0.000033 3.339590e-05 4.624199e-05 4.851360e-05 2.305581e-05 1000 100
numba-linreg-notype 4.739361e-05 predict_clr_numba(z, clr.coef_, clr.intercept_) 1.267786e-05 0.000062 6.863147e-05 3.973269e-05 7.324448e-05 3.373434e-05 1000 100
sklearn.predict 6.097874e-05 clr.predict(z) 1.576137e-05 0.000061 7.176562e-05 7.813797e-05 9.297386e-05 4.442076e-05 1000 100
csharpyml.predict 2.251525e-02 pipe.predict(obs) 5.917796e-03 0.024580 2.933990e-02 1.774908e-02 3.824456e-02 1.632412e-02 30 10
nimbusml.predict 4.612751e-01 nlr.predict(x) 5.971978e-02 0.404017 4.887386e-01 4.658451e-01 5.573754e-01 4.040165e-01 5 5

We drop a few columns and recall the legend:

  • cffi: optimized with cffi
  • custom: no loop, but the function can only predict one specific linear regression
  • float32: uses floats rather than doubles
  • linreg: linear regression
  • numba: optimized with numba
  • numpy: optimized with numpy
  • python: no C, only Python
  • simd: optimized with SIMD instructions
  • sklearn: the sklearn.predict function
  • static: the function uses static variables
  • type: the function is typed and only works with one precise input type
  • wrapped: optimized code wrapped in a Python function which itself is not (the containers are recreated every call)
cols = ["average", "deviation", "min5", "max5", "run", "code"]
df[cols]
average deviation min5 max5 run code
legend
cffi-linreg-custom-float32-simd 5.509177e-07 3.804967e-07 3.555500e-07 8.138300e-07 100 linreg_custom_float_simd(cptr_x, cptr_out)
cffi-linreg-custom 6.945190e-07 5.929634e-07 3.950600e-07 1.193090e-06 100 linreg_custom(cptr_x, cptr_out)
cffi-linreg 7.946076e-07 4.174084e-07 4.622300e-07 1.374820e-06 100 lib.linreg(n, cptr_x, cptr_coef, clr.intercept...
cffi-linreg-custom-float32 1.122592e-06 1.166959e-06 6.913600e-07 2.342720e-06 100 linreg_custom_float(cptr_x, cptr_out)
numba-linreg-type-float32 1.190092e-06 6.203381e-07 6.281500e-07 2.176790e-06 100 predict_clr_numba_cast_float(x32, c32, i32)
numba-linreg-type-custom 1.587967e-06 7.062380e-07 8.138300e-07 2.927400e-06 100 predict_clr_numba_cast_custom(x[0])
numba-linreg-type 1.730193e-06 1.246255e-06 8.414800e-07 3.492350e-06 100 predict_clr_numba_cast(x[0], clr.coef_, clr.in...
cython-linreg-type 2.684370e-06 8.923258e-07 1.572350e-06 3.958520e-06 100 predict_clr_cython_type(x[0], clr.coef_, clr.i...
python-linreg-custom 5.198307e-06 2.358929e-06 2.386170e-06 9.220750e-06 100 predict_clr_python(z)
cython-linreg 5.289535e-06 2.545080e-06 2.615300e-06 8.987660e-06 100 predict_clr_cython(x[0], clr.coef_, clr.interc...
python-linreg 8.126214e-06 3.229416e-06 4.152100e-06 1.360988e-05 100 predict_clr_python_loop(z, coef, intercept)
cffi-linreg-custom wrapped 9.714181e-06 2.861559e-06 5.878520e-06 1.622519e-05 100 predict_clr_custom(x)
cffi-linreg-custom-float wrapped 1.226456e-05 5.064189e-06 6.637040e-06 2.258174e-05 100 predict_clr_custom(x32)
numba-linreg-type-numpy 1.251224e-05 5.142496e-06 6.945180e-06 2.000988e-05 100 predict_clr_numba_numpy(x[0], clr.coef_, clr.i...
numpy-linreg-numpy 1.541223e-05 4.782486e-06 8.379260e-06 2.464001e-05 100 predict_clr_numpy(z, coef, clr.intercept_)
cffi-linreg-wrapped 1.610619e-05 6.783167e-06 8.438520e-06 2.762273e-05 100 predict_clr(x, clr)
csharpyml.predict-C# 2.045361e-05 7.309846e-06 1.136198e-05 3.436644e-05 100 pipe.predict(obs)
onnxruntime-float32 2.625491e-05 8.793059e-06 1.610668e-05 4.212545e-05 100 predict_onnxrt(x32)
onnxruntime-float64 3.246068e-05 7.862068e-06 2.305581e-05 4.851360e-05 100 predict_onnxrt(x.astype(numpy.float32))
numba-linreg-notype 4.739361e-05 1.267786e-05 3.373434e-05 7.324448e-05 100 predict_clr_numba(z, clr.coef_, clr.intercept_)
sklearn.predict 6.097874e-05 1.576137e-05 4.442076e-05 9.297386e-05 100 clr.predict(z)
csharpyml.predict 2.251525e-02 5.917796e-03 1.632412e-02 3.824456e-02 10 pipe.predict(obs)
nimbusml.predict 4.612751e-01 5.971978e-02 4.040165e-01 5.573754e-01 5 nlr.predict(x)
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(14,6))
df[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                  legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
../_images/cffi_linear_regression_124_0.png

This comparison is missing the GPU, but that is a bit more complex to set up: it requires a GPU card, and parallelization would not bring much anyway given the low dimension of the problem.
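
For the record, a hedged sketch of what a GPU version could look like with the cupy package (an assumption: cupy installed and a CUDA device available; not run here):

import cupy  # hypothetical dependency, not used elsewhere in this notebook
xg = cupy.asarray(diabetes_X_test.astype(numpy.float32))
cg = cupy.asarray(clr.coef_.astype(numpy.float32))
pred = xg @ cg + numpy.float32(clr.intercept_)   # computed on the GPU
print(cupy.asnumpy(pred)[:2])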

One-off prediction and measurement bias

The previous graph shows that scikit-learn's predict function is the slowest. The first reason is that its code handles every linear regression, while all the other functions are specialized for a single model. The second reason is that scikit-learn's code is optimized to compute several predictions at once, whereas all the other functions compute only one (the so-called one-off scenario). We compare with what a pure Python version and a numpy version would give.

def predict_clr_python_loop_multi(x, coef, intercept):
    # A two-dimensional array is expected.
    res = numpy.zeros((x.shape[0], 1))
    res[:, 0] = intercept
    for i in range(0, x.shape[0]):
        res[i, 0] += sum(a*b for a, b in zip(x[i, :], coef))
    return res

predict_clr_python_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[197.61846908],
       [155.43979328]])
def predict_clr_numpy_loop_multi(x, coef, intercept):
    # A two-dimensional array is expected.
    res = numpy.ones((x.shape[0], 1)) * intercept
    res += x @ coef.reshape((len(coef), 1))
    return res

predict_clr_numpy_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[197.61846908],
       [155.43979328]])
def predict_clr_numba_cast_multi(X, coef, intercept):
    return [predict_clr_numba_cast(x, coef, intercept) for x in X]

predict_clr_numba_cast_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[197.61846907503298, 155.43979327521237]
memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
for i in batch:
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]

    print("batch", "=", i)
    repeat=20 if i >= 5000 else 100

    memo.append(timeexe("sklearn.predict %d" % i, "clr.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"

    if ok_nimbus:
        memo.append(timeexe("nimbus.predict %d" % i, "nlr.predict(mx)",
                            repeat=2, number=2))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "nimbus"

    if i <= 1000:
        # very slow
        memo.append(timeexe("python %d" % i, "predict_clr_python_loop_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=20, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "python"

    memo.append(timeexe("numpy %d" % i, "predict_clr_numpy_loop_multi(mx, clr.coef_, clr.intercept_)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "numpy"

    if i <= 10000:
        # very slow
        memo.append(timeexe("numba %d" % i, "predict_clr_numba_cast_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "numba"

    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"
batch = 1
Moyenne: 66.11 µs Ecart-type 55.35 µs (with 10 runs) in [29.47 µs, 118.60 µs]
Moyenne: 592.30 ms Ecart-type 13.63 ms (with 2 runs) in [578.67 ms, 605.93 ms]
Moyenne: 15.48 µs Ecart-type 822.30 ns (with 10 runs) in [15.09 µs, 18.69 µs]
Moyenne: 19.65 µs Ecart-type 8.93 µs (with 10 runs) in [13.59 µs, 38.72 µs]
Moyenne: 4.86 µs Ecart-type 1.56 µs (with 10 runs) in [4.31 µs, 8.89 µs]
Moyenne: 36.93 µs Ecart-type 12.37 µs (with 10 runs) in [32.47 µs, 58.71 µs]
batch = 10
Moyenne: 65.85 µs Ecart-type 15.85 µs (with 10 runs) in [59.34 µs, 102.76 µs]
Moyenne: 418.67 ms Ecart-type 399.41 µs (with 2 runs) in [418.27 ms, 419.07 ms]
Moyenne: 130.64 µs Ecart-type 24.81 µs (with 10 runs) in [113.11 µs, 208.91 µs]
Moyenne: 15.95 µs Ecart-type 9.21 µs (with 10 runs) in [12.40 µs, 40.77 µs]
Moyenne: 19.89 µs Ecart-type 5.00 µs (with 10 runs) in [18.13 µs, 33.66 µs]
Moyenne: 36.74 µs Ecart-type 12.61 µs (with 10 runs) in [29.67 µs, 65.30 µs]
batch = 100
Moyenne: 55.08 µs Ecart-type 29.04 µs (with 10 runs) in [31.17 µs, 113.98 µs]
Moyenne: 469.03 ms Ecart-type 49.26 ms (with 2 runs) in [419.78 ms, 518.29 ms]
Moyenne: 1.29 ms Ecart-type 171.10 µs (with 10 runs) in [1.07 ms, 1.73 ms]
Moyenne: 16.05 µs Ecart-type 12.06 µs (with 10 runs) in [8.57 µs, 25.40 µs]
Moyenne: 165.51 µs Ecart-type 34.63 µs (with 10 runs) in [101.14 µs, 253.75 µs]
Moyenne: 41.49 µs Ecart-type 16.13 µs (with 10 runs) in [22.68 µs, 72.69 µs]
batch = 200
Moyenne: 69.74 µs Ecart-type 34.22 µs (with 10 runs) in [34.65 µs, 152.97 µs]
Moyenne: 536.49 ms Ecart-type 45.92 ms (with 2 runs) in [490.56 ms, 582.41 ms]
Moyenne: 2.30 ms Ecart-type 110.87 µs (with 10 runs) in [2.14 ms, 2.53 ms]
Moyenne: 23.68 µs Ecart-type 15.38 µs (with 10 runs) in [9.52 µs, 59.26 µs]
Moyenne: 362.40 µs Ecart-type 89.71 µs (with 10 runs) in [267.22 µs, 563.28 µs]
Moyenne: 49.71 µs Ecart-type 15.31 µs (with 10 runs) in [40.93 µs, 85.85 µs]
batch = 500
Moyenne: 77.50 µs Ecart-type 29.02 µs (with 10 runs) in [65.15 µs, 122.79 µs]
Moyenne: 406.04 ms Ecart-type 5.91 ms (with 2 runs) in [400.13 ms, 411.95 ms]
Moyenne: 5.72 ms Ecart-type 258.95 µs (with 10 runs) in [5.28 ms, 6.16 ms]
Moyenne: 19.57 µs Ecart-type 8.24 µs (with 10 runs) in [11.14 µs, 35.71 µs]
Moyenne: 919.31 µs Ecart-type 202.77 µs (with 10 runs) in [701.87 µs, 1.30 ms]
Moyenne: 56.63 µs Ecart-type 14.92 µs (with 10 runs) in [35.28 µs, 83.24 µs]
batch = 1000
Moyenne: 93.47 µs Ecart-type 38.36 µs (with 10 runs) in [48.63 µs, 131.63 µs]
Moyenne: 575.11 ms Ecart-type 183.62 ms (with 2 runs) in [391.49 ms, 758.74 ms]
Moyenne: 11.68 ms Ecart-type 842.74 µs (with 10 runs) in [10.65 ms, 13.72 ms]
Moyenne: 28.35 µs Ecart-type 11.67 µs (with 10 runs) in [25.56 µs, 38.91 µs]
Moyenne: 1.82 ms Ecart-type 341.89 µs (with 10 runs) in [1.40 ms, 2.52 ms]
Moyenne: 87.00 µs Ecart-type 13.41 µs (with 10 runs) in [84.66 µs, 123.69 µs]
batch = 2000
Moyenne: 116.13 µs Ecart-type 37.31 µs (with 10 runs) in [59.69 µs, 172.01 µs]
Moyenne: 408.42 ms Ecart-type 9.74 ms (with 2 runs) in [398.67 ms, 418.16 ms]
Moyenne: 40.98 µs Ecart-type 12.62 µs (with 10 runs) in [31.01 µs, 59.50 µs]
Moyenne: 3.32 ms Ecart-type 460.19 µs (with 10 runs) in [2.82 ms, 4.48 ms]
Moyenne: 142.49 µs Ecart-type 32.35 µs (with 10 runs) in [88.22 µs, 243.52 µs]
batch = 3000
Moyenne: 131.44 µs Ecart-type 41.20 µs (with 10 runs) in [64.71 µs, 181.45 µs]
Moyenne: 409.94 ms Ecart-type 14.04 ms (with 2 runs) in [395.90 ms, 423.98 ms]
Moyenne: 39.85 µs Ecart-type 13.62 µs (with 10 runs) in [22.99 µs, 71.94 µs]
Moyenne: 5.21 ms Ecart-type 770.62 µs (with 10 runs) in [4.38 ms, 6.73 ms]
Moyenne: 231.05 µs Ecart-type 55.42 µs (with 10 runs) in [189.31 µs, 329.01 µs]
batch = 4000
Moyenne: 150.87 µs Ecart-type 41.17 µs (with 10 runs) in [121.21 µs, 215.74 µs]
Moyenne: 408.31 ms Ecart-type 17.37 ms (with 2 runs) in [390.94 ms, 425.69 ms]
Moyenne: 46.04 µs Ecart-type 15.51 µs (with 10 runs) in [26.43 µs, 76.01 µs]
Moyenne: 6.74 ms Ecart-type 1.52 ms (with 10 runs) in [4.88 ms, 10.21 ms]
Moyenne: 250.30 µs Ecart-type 74.88 µs (with 10 runs) in [155.77 µs, 407.78 µs]
batch = 5000
Moyenne: 175.07 µs Ecart-type 50.92 µs (with 10 runs) in [112.87 µs, 332.84 µs]
Moyenne: 400.47 ms Ecart-type 3.32 ms (with 2 runs) in [397.16 ms, 403.79 ms]
Moyenne: 54.68 µs Ecart-type 15.54 µs (with 10 runs) in [48.04 µs, 113.26 µs]
Moyenne: 8.52 ms Ecart-type 1.52 ms (with 10 runs) in [7.12 ms, 12.84 ms]
Moyenne: 287.25 µs Ecart-type 71.18 µs (with 10 runs) in [173.12 µs, 441.36 µs]
batch = 10000
Moyenne: 245.55 µs Ecart-type 71.10 µs (with 10 runs) in [160.24 µs, 397.55 µs]
Moyenne: 431.50 ms Ecart-type 7.84 ms (with 2 runs) in [423.65 ms, 439.34 ms]
Moyenne: 90.02 µs Ecart-type 46.60 µs (with 10 runs) in [50.09 µs, 234.55 µs]
Moyenne: 16.31 ms Ecart-type 1.05 ms (with 10 runs) in [15.22 ms, 19.15 ms]
Moyenne: 590.63 µs Ecart-type 204.07 µs (with 10 runs) in [380.05 µs, 1.29 ms]
batch = 20000
Moyenne: 527.64 µs Ecart-type 180.13 µs (with 10 runs) in [320.28 µs, 1.06 ms]
Moyenne: 438.45 ms Ecart-type 746.57 µs (with 2 runs) in [437.71 ms, 439.20 ms]
Moyenne: 175.81 µs Ecart-type 63.59 µs (with 10 runs) in [92.88 µs, 306.17 µs]
Moyenne: 1.35 ms Ecart-type 299.42 µs (with 10 runs) in [1.08 ms, 1.96 ms]
batch = 50000
Moyenne: 1.80 ms Ecart-type 751.18 µs (with 10 runs) in [1.33 ms, 4.90 ms]
Moyenne: 443.91 ms Ecart-type 10.87 ms (with 2 runs) in [433.03 ms, 454.78 ms]
Moyenne: 435.21 µs Ecart-type 83.65 µs (with 10 runs) in [316.41 µs, 585.88 µs]
Moyenne: 5.64 ms Ecart-type 1.05 ms (with 10 runs) in [4.36 ms, 7.91 ms]
batch = 75000
Moyenne: 2.07 ms Ecart-type 351.14 µs (with 10 runs) in [1.50 ms, 2.80 ms]
Moyenne: 453.49 ms Ecart-type 1.56 ms (with 2 runs) in [451.93 ms, 455.04 ms]
Moyenne: 586.91 µs Ecart-type 71.69 µs (with 10 runs) in [516.62 µs, 757.61 µs]
Moyenne: 8.94 ms Ecart-type 1.04 ms (with 10 runs) in [7.65 ms, 11.38 ms]
batch = 100000
Moyenne: 2.57 ms Ecart-type 223.69 µs (with 10 runs) in [2.33 ms, 3.28 ms]
Moyenne: 507.15 ms Ecart-type 26.96 ms (with 2 runs) in [480.19 ms, 534.12 ms]
Moyenne: 802.65 µs Ecart-type 134.98 µs (with 10 runs) in [580.50 µs, 1.05 ms]
Moyenne: 12.54 ms Ecart-type 1.11 ms (with 10 runs) in [10.65 ms, 15.22 ms]
batch = 150000
Moyenne: 6.53 ms Ecart-type 402.31 µs (with 10 runs) in [6.06 ms, 7.23 ms]
Moyenne: 531.57 ms Ecart-type 9.46 ms (with 2 runs) in [522.11 ms, 541.03 ms]
Moyenne: 4.98 ms Ecart-type 388.77 µs (with 10 runs) in [4.38 ms, 5.72 ms]
Moyenne: 18.44 ms Ecart-type 1.45 ms (with 10 runs) in [16.56 ms, 21.21 ms]
batch = 200000
Moyenne: 8.59 ms Ecart-type 636.86 µs (with 10 runs) in [7.51 ms, 9.85 ms]
Moyenne: 613.15 ms Ecart-type 2.69 ms (with 2 runs) in [610.46 ms, 615.84 ms]
Moyenne: 6.45 ms Ecart-type 542.18 µs (with 10 runs) in [5.74 ms, 7.68 ms]
Moyenne: 25.32 ms Ecart-type 3.26 ms (with 10 runs) in [22.83 ms, 35.46 ms]
batch = 300000
Moyenne: 11.70 ms Ecart-type 1.57 ms (with 10 runs) in [9.68 ms, 14.26 ms]
Moyenne: 849.83 ms Ecart-type 22.12 ms (with 2 runs) in [827.71 ms, 871.96 ms]
Moyenne: 9.00 ms Ecart-type 911.11 µs (with 10 runs) in [8.07 ms, 11.53 ms]
Moyenne: 38.63 ms Ecart-type 3.44 ms (with 10 runs) in [31.65 ms, 44.07 ms]
batch = 400000
Moyenne: 15.67 ms Ecart-type 861.19 µs (with 10 runs) in [14.59 ms, 17.50 ms]
Moyenne: 1.07 s Ecart-type 18.64 ms (with 2 runs) in [1.05 s, 1.09 s]
Moyenne: 12.73 ms Ecart-type 1.45 ms (with 10 runs) in [9.61 ms, 14.14 ms]
Moyenne: 51.42 ms Ecart-type 6.55 ms (with 10 runs) in [41.90 ms, 62.90 ms]
batch = 500000
Moyenne: 19.82 ms Ecart-type 2.00 ms (with 10 runs) in [16.57 ms, 23.15 ms]
Moyenne: 1.26 s Ecart-type 6.61 ms (with 2 runs) in [1.25 s, 1.26 s]
Moyenne: 16.03 ms Ecart-type 1.02 ms (with 10 runs) in [14.14 ms, 17.45 ms]
Moyenne: 66.45 ms Ecart-type 5.87 ms (with 10 runs) in [59.36 ms, 79.10 ms]
batch = 600000
Moyenne: 25.01 ms Ecart-type 1.03 ms (with 10 runs) in [23.19 ms, 26.39 ms]
Moyenne: 1.54 s Ecart-type 22.09 ms (with 2 runs) in [1.52 s, 1.57 s]
Moyenne: 19.92 ms Ecart-type 1.21 ms (with 10 runs) in [18.89 ms, 21.52 ms]
Moyenne: 76.00 ms Ecart-type 10.64 ms (with 10 runs) in [60.07 ms, 96.82 ms]
dfb = pandas.DataFrame(memo)[["average", "lib", "batch"]]
piv = dfb.pivot("batch", "lib", "average")
piv
lib nimbus numba numpy onnxruntime python sklearn
batch
1 0.592297 0.000005 0.000020 0.000037 0.000015 0.000066
10 0.418668 0.000020 0.000016 0.000037 0.000131 0.000066
100 0.469035 0.000166 0.000016 0.000041 0.001286 0.000055
200 0.536486 0.000362 0.000024 0.000050 0.002297 0.000070
500 0.406039 0.000919 0.000020 0.000057 0.005716 0.000078
1000 0.575115 0.001820 0.000028 0.000087 0.011675 0.000093
2000 0.408418 0.003323 0.000041 0.000142 NaN 0.000116
3000 0.409942 0.005206 0.000040 0.000231 NaN 0.000131
4000 0.408313 0.006739 0.000046 0.000250 NaN 0.000151
5000 0.400474 0.008520 0.000055 0.000287 NaN 0.000175
10000 0.431496 0.016312 0.000090 0.000591 NaN 0.000246
20000 0.438452 NaN 0.000176 0.001346 NaN 0.000528
50000 0.443906 NaN 0.000435 0.005636 NaN 0.001803
75000 0.453490 NaN 0.000587 0.008943 NaN 0.002069
100000 0.507152 NaN 0.000803 0.012535 NaN 0.002571
150000 0.531567 NaN 0.004980 0.018435 NaN 0.006531
200000 0.613150 NaN 0.006451 0.025321 NaN 0.008589
300000 0.849833 NaN 0.009000 0.038631 NaN 0.011699
400000 1.072258 NaN 0.012731 0.051418 NaN 0.015670
500000 1.257586 NaN 0.016030 0.066452 NaN 0.019817
600000 1.543260 NaN 0.019922 0.076000 NaN 0.025006
for c in piv.columns:
    piv["ave_" + c] = piv[c] / piv.index
piv
lib nimbus numba numpy onnxruntime python sklearn ave_nimbus ave_numba ave_numpy ave_onnxruntime ave_python ave_sklearn
batch
1 0.592297 0.000005 0.000020 0.000037 0.000015 0.000066 0.592297 0.000005 1.964997e-05 3.693276e-05 0.000015 6.610610e-05
10 0.418668 0.000020 0.000016 0.000037 0.000131 0.000066 0.041867 0.000002 1.594786e-06 3.674391e-06 0.000013 6.584654e-06
100 0.469035 0.000166 0.000016 0.000041 0.001286 0.000055 0.004690 0.000002 1.604505e-07 4.149494e-07 0.000013 5.507597e-07
200 0.536486 0.000362 0.000024 0.000050 0.002297 0.000070 0.002682 0.000002 1.183823e-07 2.485275e-07 0.000011 3.486915e-07
500 0.406039 0.000919 0.000020 0.000057 0.005716 0.000078 0.000812 0.000002 3.914826e-08 1.132690e-07 0.000011 1.550096e-07
1000 0.575115 0.001820 0.000028 0.000087 0.011675 0.000093 0.000575 0.000002 2.834648e-08 8.700013e-08 0.000012 9.346848e-08
2000 0.408418 0.003323 0.000041 0.000142 NaN 0.000116 0.000204 0.000002 2.049028e-08 7.124368e-08 NaN 5.806521e-08
3000 0.409942 0.005206 0.000040 0.000231 NaN 0.000131 0.000137 0.000002 1.328474e-08 7.701560e-08 NaN 4.381170e-08
4000 0.408313 0.006739 0.000046 0.000250 NaN 0.000151 0.000102 0.000002 1.150973e-08 6.257395e-08 NaN 3.771745e-08
5000 0.400474 0.008520 0.000055 0.000287 NaN 0.000175 0.000080 0.000002 1.093650e-08 5.745029e-08 NaN 3.501355e-08
10000 0.431496 0.016312 0.000090 0.000591 NaN 0.000246 0.000043 0.000002 9.002472e-09 5.906294e-08 NaN 2.455487e-08
20000 0.438452 NaN 0.000176 0.001346 NaN 0.000528 0.000022 NaN 8.790622e-09 6.730423e-08 NaN 2.638184e-08
50000 0.443906 NaN 0.000435 0.005636 NaN 0.001803 0.000009 NaN 8.704280e-09 1.127115e-07 NaN 3.605019e-08
75000 0.453490 NaN 0.000587 0.008943 NaN 0.002069 0.000006 NaN 7.825438e-09 1.192387e-07 NaN 2.758325e-08
100000 0.507152 NaN 0.000803 0.012535 NaN 0.002571 0.000005 NaN 8.026492e-09 1.253538e-07 NaN 2.570573e-08
150000 0.531567 NaN 0.004980 0.018435 NaN 0.006531 0.000004 NaN 3.319770e-08 1.229013e-07 NaN 4.353777e-08
200000 0.613150 NaN 0.006451 0.025321 NaN 0.008589 0.000003 NaN 3.225735e-08 1.266035e-07 NaN 4.294423e-08
300000 0.849833 NaN 0.009000 0.038631 NaN 0.011699 0.000003 NaN 2.999960e-08 1.287707e-07 NaN 3.899577e-08
400000 1.072258 NaN 0.012731 0.051418 NaN 0.015670 0.000003 NaN 3.182658e-08 1.285457e-07 NaN 3.917440e-08
500000 1.257586 NaN 0.016030 0.066452 NaN 0.019817 0.000003 NaN 3.206050e-08 1.329033e-07 NaN 3.963485e-08
600000 1.543260 NaN 0.019922 0.076000 NaN 0.025006 0.000003 NaN 3.320313e-08 1.266671e-07 NaN 4.167692e-08
libs = list(c for c in piv.columns if "ave_" in c)
ax = piv.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch")
ax.grid(True);
../_images/cffi_linear_regression_133_0.png
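
The floor visible on this plot can also be read directly from the table; a quick check over the per-observation columns computed above (reusing the libs list from the plotting cell):

# smallest per-observation time across all libraries and batch sizes
print(piv[libs].min().min())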

The minimum reached is around 10^{-8} s, i.e. 10 ns per observation. This shows that the previous comparison was incomplete, even biased: everything depends on how the prediction function is used, although it will always be possible to write specialized code faster than any generic function. As a general rule, the more the program stays on the Python side, the slower it is; the number of transitions from one side to the other, depending on how they are made, also slows it down. With this in mind, the red program will be slower than the green one.

from pyquickhelper.helpgen import NbImage
NbImage("pycpp.png")
../_images/cffi_linear_regression_135_0.png
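
To make the cost of crossing the Python/native boundary concrete, here is a minimal standalone sketch (not part of the original benchmark): summing a vector element by element from Python, one transition per element, versus a single numpy call that keeps the loop in compiled code.

import numpy
import timeit

v = numpy.random.rand(10000)

def sum_python_side(v):
    # one Python/C transition per element: slow
    total = 0.0
    for i in range(v.shape[0]):
        total += v[i]
    return total

def sum_native_side(v):
    # a single transition, the loop runs in compiled code: fast
    return v.sum()

print(timeit.timeit(lambda: sum_python_side(v), number=100))
print(timeit.timeit(lambda: sum_native_side(v), number=100))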

These results are generally quite volatile: the computation time is wrapped in several Python functions, which makes a precise measurement difficult. They nevertheless give a good idea of the orders of magnitude.

Random Forest

We reproduce the same results for a random forest, but the rewriting is no longer as simple as for a linear regression, as the sketch below suggests.
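
To give an idea of why, here is a minimal standalone sketch of what a hand-written prediction has to do for a single tree, relying on the arrays scikit-learn stores in tree_ (children_left, children_right, feature, threshold, value); rf_demo and the helper functions are illustrative names, not the notebook's code. A regression forest then averages this traversal over all its trees.

import numpy
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# standalone model trained only for this illustration
data = load_diabetes()
rf_demo = RandomForestRegressor(n_estimators=10).fit(data.data, data.target)

def predict_one_tree(tree, x):
    # walk the tree stored as parallel arrays until a leaf (-1) is reached
    node = 0
    while tree.children_left[node] != -1:
        if x[tree.feature[node]] <= tree.threshold[node]:
            node = tree.children_left[node]
        else:
            node = tree.children_right[node]
    return tree.value[node][0][0]

def predict_forest(forest, x):
    # a regression forest averages the predictions of its trees
    return numpy.mean([predict_one_tree(est.tree_, x) for est in forest.estimators_])

# should match forest.predict on the same observation
print(predict_forest(rf_demo, data.data[0]), rf_demo.predict(data.data[:1]))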

One prediction at a time

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test  = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test  = diabetes.target[-20:]
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10)
rf.fit(diabetes_X_train, diabetes_y_train)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
memo_time = []
x = diabetes_X_test[:1]
memo_time.append(timeexe("sklearn-rf", "rf.predict(x)", repeat=100, number=20))
Moyenne: 923.47 µs Ecart-type 397.55 µs (with 20 runs) in [600.06 µs, 1.71 ms]

This is much slower than the linear regression. Let's try with onnx.

if ok_onnx:
    # the feature count comes from the linear model trained on the same data
    onnxrf_model = convert_sklearn(rf, 'model', [('input', FloatTensorType([1, clr.coef_.shape[0]]))])
    save_model(onnxrf_model, 'model_rf.onnx')
    model_onnx = onnx.load('model_rf.onnx')
if ok_onnx:
    sess = onnxruntime.InferenceSession("model_rf.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt_rf(x):
        return sess.run(["variable"], {'input': x})

    print(predict_onnxrt_rf(x.astype(numpy.float32)))
    memo_time.append(timeexe("onnx-rf", "predict_onnxrt_rf(x.astype(numpy.float32))", repeat=100, number=20))
Input: NodeArg(name='input', type='tensor(float)', shape=[1, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[1, 1])
[array([[210.1]], dtype=float32)]
Moyenne: 33.95 µs Ecart-type 11.69 µs (with 20 runs) in [18.79 µs, 63.55 µs]

It is much faster.

if ok_nimbus:
    from nimbusml.ensemble import FastTreesRegressor
    nrf = FastTreesRegressor(num_trees=10)
    nrf.fit(diabetes_X_train, diabetes_y_train)
Not adding a normalizer.
Making per-feature arrays
Changing data from row-wise to column-wise
Processed 422 instances
Binning and forming Feature objects
Reserved memory for tree learner: 170508 bytes
Starting to train ...
Not training a calibrator because it is not needed.
Elapsed time: 00:00:00.9738875
if ok_nimbus:
    memo_time.append(timeexe("nimbus-rf", "nrf.predict(x)", repeat=10, number=2))
Moyenne: 342.11 ms Ecart-type 47.06 ms (with 2 runs) in [299.55 ms, 472.53 ms]

With csharpyml.

if ok_csharpyml:
    piperf = CSPipeline([concatation], "ft{iter=10}")
    piperf.fit(conc, feature="Feature", label="Label")
if ok_csharpyml:
    x2 = conc_test[:1]
    memo_time.append(timeexe("csharpyml-rf", "piperf.predict(x2)", repeat=10, number=20))
Moyenne: 23.10 ms Ecart-type 2.29 ms (with 20 runs) in [20.16 ms, 27.47 ms]
import pandas
df2 = pandas.DataFrame(data=memo_time)
df2 = df2.set_index("legend").sort_values("average")
df2
average code deviation first first3 last3 max5 min5 repeat run
legend
onnx-rf 0.000034 predict_onnxrt_rf(x.astype(numpy.float32)) 0.000012 0.000077 0.000058 0.000028 0.000064 0.000019 100 20
sklearn-rf 0.000923 rf.predict(x) 0.000398 0.000892 0.000927 0.000847 0.001708 0.000600 100 20
csharpyml-rf 0.023101 piperf.predict(x2) 0.002286 0.027468 0.022702 0.021971 0.027468 0.020159 10 20
nimbus-rf 0.342109 nrf.predict(x) 0.047062 0.472526 0.379317 0.324538 0.472526 0.299553 10 2
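
To put these timings side by side, the ratio of each library's average time to scikit-learn's can be derived from the df2 table above:

# how many times slower (or faster) each library is than scikit-learn
print(df2["average"] / df2.loc["sklearn-rf", "average"])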
fig, ax = plt.subplots(1, 1, figsize=(14,4))
df2[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                   legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
../_images/cffi_linear_regression_152_0.png

Batch prediction

if ok_csharpyml:
    def predict_csharpyml_rf(X):
        # csharpyml expects a DataFrame with named feature columns plus a Label column
        dft = DataFrame(X.astype(numpy.float32), columns=["F%d" % i for i in range(0, X.shape[1])])
        dft["Label"] = numpy.zeros((dft.shape[0],), dtype=numpy.float32)
        return piperf.predict(dft)

    print(predict_csharpyml_rf(diabetes_X_test[:1]))
         F0       F1        F2        F3        F4        F5       F6  \
0 -0.078165  0.05068  0.077863  0.052858  0.078236  0.064447  0.02655

         F7        F8        F9     ...  Feature.3  Feature.4  Feature.5  \
0 -0.002592  0.040672 -0.009362     ...   0.052858   0.078236   0.064447

   Feature.6  Feature.7  Feature.8  Feature.9  PredictedLabel     Score  \
0    0.02655  -0.002592   0.040672  -0.009362            True  6.737847

   Probability
0     0.936739

[1 rows x 24 columns]
memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
repeat = 10
for i in batch[:15]:
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        # tile the test set until it has at least i rows, then truncate to i
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]

    print("batch", "=", i)

    memo.append(timeexe("sklearn.predict %d" % i, "rf.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"

    if ok_nimbus:
        memo.append(timeexe("nimbus.predict %d" % i, "nrf.predict(mx)",
                            repeat=2, number=2))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "nimbus"

    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt_rf(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"

    if ok_csharpyml and i <= 10000:
        memo.append(timeexe("csharpyml %d" % i, "predict_csharpyml_rf(mx.astype(numpy.float32))",
                            repeat=2, number=2))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "csharpyml"
batch = 1
Moyenne: 848.40 µs Ecart-type 581.56 µs (with 10 runs) in [464.71 µs, 2.09 ms]
Moyenne: 325.06 ms Ecart-type 6.14 ms (with 2 runs) in [318.92 ms, 331.20 ms]
Moyenne: 47.06 µs Ecart-type 14.17 µs (with 10 runs) in [38.99 µs, 75.50 µs]
Moyenne: 20.93 ms Ecart-type 99.16 µs (with 2 runs) in [20.83 ms, 21.02 ms]
batch = 10
Moyenne: 941.97 µs Ecart-type 77.09 µs (with 10 runs) in [802.41 µs, 1.11 ms]
Moyenne: 320.64 ms Ecart-type 7.00 ms (with 2 runs) in [313.63 ms, 327.64 ms]
Moyenne: 57.75 µs Ecart-type 13.35 µs (with 10 runs) in [43.14 µs, 80.71 µs]
Moyenne: 24.82 ms Ecart-type 518.12 µs (with 2 runs) in [24.30 ms, 25.33 ms]
batch = 100
Moyenne: 1.52 ms Ecart-type 577.55 µs (with 10 runs) in [799.49 µs, 2.95 ms]
Moyenne: 348.16 ms Ecart-type 31.74 ms (with 2 runs) in [316.42 ms, 379.89 ms]
Moyenne: 546.66 µs Ecart-type 71.10 µs (with 10 runs) in [468.23 µs, 688.71 µs]
Moyenne: 46.86 ms Ecart-type 58.17 µs (with 2 runs) in [46.80 ms, 46.92 ms]
batch = 200
Moyenne: 1.04 ms Ecart-type 193.15 µs (with 10 runs) in [733.59 µs, 1.44 ms]
Moyenne: 349.80 ms Ecart-type 12.77 ms (with 2 runs) in [337.03 ms, 362.57 ms]
Moyenne: 835.22 µs Ecart-type 120.94 µs (with 10 runs) in [669.16 µs, 1.05 ms]
Moyenne: 52.39 ms Ecart-type 2.55 ms (with 2 runs) in [49.84 ms, 54.95 ms]
batch = 500
Moyenne: 1.22 ms Ecart-type 268.18 µs (with 10 runs) in [795.34 µs, 1.65 ms]
Moyenne: 375.55 ms Ecart-type 51.64 ms (with 2 runs) in [323.92 ms, 427.19 ms]
Moyenne: 2.52 ms Ecart-type 197.90 µs (with 10 runs) in [2.25 ms, 2.92 ms]
Moyenne: 92.29 ms Ecart-type 10.98 ms (with 2 runs) in [81.31 ms, 103.26 ms]
batch = 1000
Moyenne: 1.60 ms Ecart-type 143.34 µs (with 10 runs) in [1.35 ms, 1.79 ms]
Moyenne: 430.68 ms Ecart-type 75.22 ms (with 2 runs) in [355.46 ms, 505.91 ms]
Moyenne: 4.46 ms Ecart-type 344.82 µs (with 10 runs) in [4.03 ms, 5.39 ms]
Moyenne: 170.25 ms Ecart-type 13.00 ms (with 2 runs) in [157.26 ms, 183.25 ms]
batch = 2000
Moyenne: 2.32 ms Ecart-type 260.86 µs (with 10 runs) in [1.73 ms, 2.82 ms]
Moyenne: 452.81 ms Ecart-type 65.83 ms (with 2 runs) in [386.99 ms, 518.64 ms]
Moyenne: 8.44 ms Ecart-type 497.05 µs (with 10 runs) in [7.51 ms, 9.03 ms]
Moyenne: 294.08 ms Ecart-type 8.07 ms (with 2 runs) in [286.01 ms, 302.15 ms]
batch = 3000
Moyenne: 3.56 ms Ecart-type 803.14 µs (with 10 runs) in [2.63 ms, 5.05 ms]
Moyenne: 381.12 ms Ecart-type 59.13 ms (with 2 runs) in [321.99 ms, 440.24 ms]
Moyenne: 11.79 ms Ecart-type 757.59 µs (with 10 runs) in [10.99 ms, 13.25 ms]
Moyenne: 452.81 ms Ecart-type 38.70 ms (with 2 runs) in [414.11 ms, 491.51 ms]
batch = 4000
Moyenne: 4.73 ms Ecart-type 388.83 µs (with 10 runs) in [4.12 ms, 5.39 ms]
Moyenne: 444.54 ms Ecart-type 42.27 ms (with 2 runs) in [402.27 ms, 486.82 ms]
Moyenne: 16.15 ms Ecart-type 2.58 ms (with 10 runs) in [10.97 ms, 20.56 ms]
Moyenne: 526.33 ms Ecart-type 31.14 ms (with 2 runs) in [495.19 ms, 557.47 ms]
batch = 5000
Moyenne: 7.80 ms Ecart-type 3.29 ms (with 10 runs) in [3.28 ms, 12.37 ms]
Moyenne: 491.09 ms Ecart-type 145.90 ms (with 2 runs) in [345.19 ms, 636.99 ms]
Moyenne: 20.67 ms Ecart-type 2.97 ms (with 10 runs) in [15.17 ms, 25.81 ms]
Moyenne: 726.13 ms Ecart-type 4.97 ms (with 2 runs) in [721.15 ms, 731.10 ms]
batch = 10000
Moyenne: 6.86 ms Ecart-type 920.91 µs (with 10 runs) in [5.11 ms, 8.51 ms]
Moyenne: 593.54 ms Ecart-type 205.58 ms (with 2 runs) in [387.96 ms, 799.12 ms]
Moyenne: 43.68 ms Ecart-type 5.21 ms (with 10 runs) in [36.71 ms, 56.40 ms]
Moyenne: 1.60 s Ecart-type 147.62 ms (with 2 runs) in [1.46 s, 1.75 s]
batch = 20000
Moyenne: 16.04 ms Ecart-type 1.85 ms (with 10 runs) in [12.46 ms, 19.16 ms]
Moyenne: 376.87 ms Ecart-type 46.45 ms (with 2 runs) in [330.42 ms, 423.31 ms]
Moyenne: 86.40 ms Ecart-type 11.04 ms (with 10 runs) in [72.54 ms, 110.95 ms]
batch = 50000
Moyenne: 40.80 ms Ecart-type 12.48 ms (with 10 runs) in [29.52 ms, 69.32 ms]
Moyenne: 367.42 ms Ecart-type 13.03 ms (with 2 runs) in [354.39 ms, 380.45 ms]
Moyenne: 222.82 ms Ecart-type 18.00 ms (with 10 runs) in [194.52 ms, 258.90 ms]
batch = 75000
Moyenne: 61.79 ms Ecart-type 5.91 ms (with 10 runs) in [49.56 ms, 70.33 ms]
Moyenne: 407.89 ms Ecart-type 11.15 ms (with 2 runs) in [396.75 ms, 419.04 ms]
Moyenne: 331.76 ms Ecart-type 27.24 ms (with 10 runs) in [279.56 ms, 380.92 ms]
batch = 100000
Moyenne: 85.46 ms Ecart-type 5.47 ms (with 10 runs) in [73.70 ms, 92.72 ms]
Moyenne: 442.55 ms Ecart-type 16.33 ms (with 2 runs) in [426.22 ms, 458.87 ms]
Moyenne: 436.85 ms Ecart-type 32.68 ms (with 10 runs) in [385.22 ms, 487.51 ms]
dfbrf = pandas.DataFrame(memo)[["average", "lib", "batch"]]
pivrf = dfbrf.pivot(index="batch", columns="lib", values="average")
for c in pivrf.columns:
    pivrf["ave_" + c] = pivrf[c] / pivrf.index
libs = list(c for c in pivrf.columns if "ave_" in c)
ax = pivrf.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch\nrandom forest")
ax.grid(True);
../_images/cffi_linear_regression_156_0.png
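
The crossovers can also be read off the table rather than the plot; for instance, the library with the lowest per-observation cost at each batch size (a quick check on the pivrf table built above):

# fastest library (per observation) for each batch size; NaN entries are skipped
print(pivrf[libs].idxmin(axis=1))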