Optimisation de code avec cffi, numba, cython

Links: notebook, html, PDF, python, slides, GitHub

L’idée est de recoder une fonction en C. On prend comme exemple la fonction de prédiction de la régression linéaire de scikit-learn et de prévoir le gain de temps qu’on obtient en recodant la fonction dans un langage plus rapide.

from jyquickhelper import add_notebook_menu
add_notebook_menu()
memo_time = []
import timeit

def unit(x):
    if x >= 1: return "%1.2f s" % x
    elif x >= 1e-3: return "%1.2f ms" % (x* 1000)
    elif x >= 1e-6: return "%1.2f µs" % (x* 1000**2)
    elif x >= 1e-9: return "%1.2f ns" % (x* 1000**3)
    else:
        return "%1.2g s" % x

def timeexe(legend, code, number=100, repeat=1000):
    rep = timeit.repeat(code, number=number, repeat=repeat, globals=globals())
    ave = sum(rep) / (number * repeat)
    std = (sum((x/number - ave)**2 for x in rep) / repeat)**0.5
    fir = rep[0]/number
    fir3 = sum(rep[:3]) / (3 * number)
    las3 = sum(rep[-3:]) / (3 * number)
    rep.sort()
    mini = rep[len(rep)//20] / number
    maxi = rep[-len(rep)//20] / number
    print("Moyenne: %s Ecart-type %s (with %d runs) in [%s, %s]" % (
                unit(ave), unit(std), number, unit(mini), unit(maxi)))
    return dict(legend=legend, average=ave, deviation=std, first=fir, first3=fir3,
                last3=las3, repeat=repeat, min5=mini, max5=maxi, code=code, run=number)

Régression linéaire

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test  = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test  = diabetes.target[-20:]
from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(diabetes_X_train, diabetes_y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
clr.coef_
array([ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
       -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
        7.43519617e+02,  7.60951722e+01])
clr.intercept_
152.76430691633442
z = diabetes_X_test[0:1,:]
memo_time.append(timeexe("sklearn.predict", "clr.predict(z)"))
Moyenne: 37.24 µs Ecart-type 18.35 µs (with 100 runs) in [30.90 µs, 59.07 µs]
%timeit clr.predict(z)
33.4 µs ± 3.04 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

optimisation avec cffi

On s’inspire de l’exemple Purely for performance (API level, out-of-line).

from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg(int, double *, double *, double, double *);")

ffibuilder.set_source("_linear_regression",
r"""
    static int linreg(int dimension, double * x, double *coef, double intercept, double * out)
    {
        for(; dimension > 0; --dimension, ++x, ++coef)
            intercept += *x * *coef;
        *out = intercept;
        return 1;
    }
""")

ffibuilder.compile(verbose=True)
generating ._linear_regression.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression' extension
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Ic:python370_x64include -Ic:python370_x64include "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFCinclude" "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023include" "-IC:Program Files (x86)Windows KitsNETFXSDK4.6.1includeum" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0ucrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0shared" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0um" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0winrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0cppwinrt" /Tc_linear_regression.c /Fo.Release_linear_regression.obj
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:c:python370_x64libs /LIBPATH:c:python370_x64PCbuildamd64 "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFClibx64" "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023libx64" "/LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.6.1libumx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0ucrtx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0umx64" /EXPORT:PyInit__linear_regression .Release_linear_regression.obj /OUT:._linear_regression.cp37-win_amd64.pyd /IMPLIB:.Release_linear_regression.cp37-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression.cp37-win_amd64.pyd'

La fonction compilée est accessible comme suit.

from _linear_regression import ffi, lib
lib.linreg
<function _linear_regression.CompiledLib.linreg>

On s’inspire de l’exemple How to pass a Numpy array into a cffi function and how to get one back out?.

import numpy
out = numpy.zeros(1)
ptr_coef = clr.coef_.__array_interface__['data'][0]
cptr_coef = ffi.cast ( "double*" , ptr_coef )
x = diabetes_X_test[0:1,:]
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast ( "double*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "double*" , ptr_out )
n = len(clr.coef_)
lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)
1
out
array([197.61846908])

On vérifie qu’on obtient bien la même chose.

clr.predict(x)
array([197.61846908])

Et on mesure le temps d’exécution :

memo_time.append(timeexe("cffi-linreg", "lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)"))
Moyenne: 781.50 ns Ecart-type 541.34 ns (with 100 runs) in [458.27 ns, 1.43 µs]

C’est beaucoup plus rapide. Pour être totalement honnête, il faut mesurer les étapes qui consiste à extraire les pointeurs.

def predict_clr(x, clr):
    out = numpy.zeros(1)
    ptr_coef = clr.coef_.__array_interface__['data'][0]
    cptr_coef = ffi.cast ( "double*" , ptr_coef )
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast ( "double*" , ptr_x )
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast ( "double*" , ptr_out )
    lib.linreg(len(x), cptr_x, cptr_coef, clr.intercept_, cptr_out)
    return out

predict_clr(x, clr)
array([152.74058378])
memo_time.append(timeexe("cffi-linreg-wrapped", "predict_clr(x, clr)"))
Moyenne: 8.21 µs Ecart-type 3.68 µs (with 100 runs) in [6.86 µs, 14.30 µs]

Cela reste plus rapide.

cffi - seconde version

Comme on construit la fonction en dynamique (le code est connu lors de l’exécution), on peut facilement se passer de la boucle et écrire le code sans boucle et avec les coefficients.

res = " + ".join("{0}*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'0.3034995490660432*x[0] + -237.63931533353403*x[1] + 510.5306054362253*x[2] + 327.7369804093466*x[3] + -814.1317093725387*x[4] + 492.81458798373217*x[5] + 102.8484521916802*x[6] + 184.60648905984*x[7] + 743.519616750542*x[8] + 76.09517221662392*x[9]'
code = """
    static int linreg_custom(double * x, double * out)
    {{
        out[0] = {0} + {1};
    }}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom(double * x, double * out)
{
    out[0] = 152.76430691633442 + 0.3034995490660432*x[0] + -237.63931533353403*x[1] + 510.5306054362253*x[2] + 327.7369804093466*x[3] + -814.1317093725387*x[4] + 492.81458798373217*x[5] + 102.8484521916802*x[6] + 184.60648905984*x[7] + 743.519616750542*x[8] + 76.09517221662392*x[9];
}
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom(double *, double *);")
ffibuilder.set_source("_linear_regression_custom", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom' extension
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Ic:python370_x64include -Ic:python370_x64include "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFCinclude" "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023include" "-IC:Program Files (x86)Windows KitsNETFXSDK4.6.1includeum" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0ucrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0shared" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0um" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0winrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0cppwinrt" /Tc_linear_regression_custom.c /Fo.Release_linear_regression_custom.obj
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:c:python370_x64libs /LIBPATH:c:python370_x64PCbuildamd64 "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFClibx64" "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023libx64" "/LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.6.1libumx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0ucrtx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0umx64" /EXPORT:PyInit__linear_regression_custom .Release_linear_regression_custom.obj /OUT:._linear_regression_custom.cp37-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom.cp37-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom.cp37-win_amd64.pyd'
from _linear_regression_custom.lib import linreg_custom
linreg_custom(cptr_x, cptr_out)
out
array([197.61846908])
memo_time.append(timeexe("cffi-linreg-custom", "linreg_custom(cptr_x, cptr_out)"))
Moyenne: 400.93 ns Ecart-type 176.46 ns (with 100 runs) in [320.00 ns, 655.80 ns]

On a gagné un facteur 2.

def predict_clr_custom(x):
    out = numpy.zeros(1)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast("double*", ptr_x)
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast("double*", ptr_out)
    linreg_custom(cptr_x, cptr_out)
    return out

predict_clr_custom(x)
array([197.61846908])
memo_time.append(timeexe("cffi-linreg-custom wrapped", "predict_clr_custom(x)"))
Moyenne: 5.53 µs Ecart-type 2.17 µs (with 100 runs) in [4.85 µs, 8.51 µs]

C’est un peu plus rapide.

et en float?

L’ordinateur fait la distinction entre les double code sur 64 bit et les float codé sur 32 bits. La précision est meilleure dans le premier cas et les calculs sont plus rapides dans le second. Dans le cas du machine learning, on préfère la rapidité à une perte précision en précision qui est souvent compensée par l’optimisation inhérente à tout problème de machine learning. Ce qu’on perd sur une observation, on le retrouve sur une autre.

res = " + ".join("{0}f*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'0.3034995490660432f*x[0] + -237.63931533353403f*x[1] + 510.5306054362253f*x[2] + 327.7369804093466f*x[3] + -814.1317093725387f*x[4] + 492.81458798373217f*x[5] + 102.8484521916802f*x[6] + 184.60648905984f*x[7] + 743.519616750542f*x[8] + 76.09517221662392f*x[9]'
code = """
    static int linreg_custom_float(float * x, float * out)
    {{
        out[0] = {0}f + {1};
    }}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom_float(float * x, float * out)
{
    out[0] = 152.76430691633442f + 0.3034995490660432f*x[0] + -237.63931533353403f*x[1] + 510.5306054362253f*x[2] + 327.7369804093466f*x[3] + -814.1317093725387f*x[4] + 492.81458798373217f*x[5] + 102.8484521916802f*x[6] + 184.60648905984f*x[7] + 743.519616750542f*x[8] + 76.09517221662392f*x[9];
}
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom_float(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom_float.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom_float' extension
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Ic:python370_x64include -Ic:python370_x64include "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFCinclude" "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023include" "-IC:Program Files (x86)Windows KitsNETFXSDK4.6.1includeum" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0ucrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0shared" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0um" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0winrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0cppwinrt" /Tc_linear_regression_custom_float.c /Fo.Release_linear_regression_custom_float.obj
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:c:python370_x64libs /LIBPATH:c:python370_x64PCbuildamd64 "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFClibx64" "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023libx64" "/LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.6.1libumx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0ucrtx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0umx64" /EXPORT:PyInit__linear_regression_custom_float .Release_linear_regression_custom_float.obj /OUT:._linear_regression_custom_float.cp37-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom_float.cp37-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float.cp37-win_amd64.pyd'
from _linear_regression_custom_float.lib import linreg_custom_float
def predict_clr_custom_float(x):
    out = numpy.zeros(1, dtype=numpy.float32)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast ( "float*" , ptr_x )
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast ( "float*" , ptr_out )
    linreg_custom_float(cptr_x, cptr_out)
    return out

Avant d’appeler la fonction, on doit transformer le vecteur iniatial en float32.

x32 = x.astype(numpy.float32)
predict_clr_custom(x32)
array([152.76430692])
memo_time.append(timeexe("cffi-linreg-custom-float wrapped", "predict_clr_custom(x32)"))
Moyenne: 7.21 µs Ecart-type 3.59 µs (with 100 runs) in [5.15 µs, 13.26 µs]

La différence n’est pas flagrante. Mesurons le code C uniquement même si la partie Python ne peut pas être complètement évitée.

out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )

memo_time.append(timeexe("cffi-linreg-custom-float32", "linreg_custom_float(cptr_x, cptr_out)"))
Moyenne: 499.05 ns Ecart-type 250.34 ns (with 100 runs) in [331.85 ns, 1.05 µs]

La différence n’est pas significative.

SIMD

C’est un ensemble d’instructions processeur pour faire des opérations terme à terme sur 4 float32 aussi rapidement qu’une seule. Le processeur ne peut faire des opérations que les nombres sont copiés dans ses registres. Le programme passe alors son temps à copier des nombres depuis la mémoire vers les registres du processeur puis à faire la copie dans le chemin inverse pour le résultat. Les instructions SIMD font gagner du temps du niveau du calcul. Au lieu de faire 4 opérations de multiplication terme à terme, il n’en fait plus qu’une. Il suffit de savoir comment utiliser ces instructions. Avec Visual Studio, elles sont accessible via ces fonctions Memory and Initialization Using Streaming SIMD Extensions. Le code suivant n’est probablement pas optimal mais il n’est pas trop compliqué à suivre.

code = """
#include <xmmintrin.h>

static int linreg_custom_float_simd(float * x, float * out)
{
    __m128 c1 = _mm_set_ps(0.3034995490664121f, -237.63931533353392f, 510.5306054362245f, 327.7369804093466f);
    __m128 c2 = _mm_set_ps(-814.1317093725389f, 492.81458798373245f, 102.84845219168025f, 184.60648905984064f);
    __m128 r1 = _mm_set_ss(152.76430691633442f);
    r1 = _mm_add_ss(r1, _mm_mul_ps(c1, _mm_load_ps(x)));
    r1 = _mm_add_ss(r1, _mm_mul_ps(c2, _mm_load_ps(x+4)));
    float r[4];
    _mm_store_ps(r, r1);
    out[0] = r[0] + r[1] + r[2] + r[3] + 743.5196167505419f * x[8] + 76.095172216624f * x[9];
    return 1;
}
"""
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom_float_simd(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float_simd", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom_float_simd.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom_float_simd' extension
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Ic:python370_x64include -Ic:python370_x64include "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFCinclude" "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023include" "-IC:Program Files (x86)Windows KitsNETFXSDK4.6.1includeum" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0ucrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0shared" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0um" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0winrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0cppwinrt" /Tc_linear_regression_custom_float_simd.c /Fo.Release_linear_regression_custom_float_simd.obj
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:c:python370_x64libs /LIBPATH:c:python370_x64PCbuildamd64 "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFClibx64" "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023libx64" "/LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.6.1libumx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0ucrtx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0umx64" /EXPORT:PyInit__linear_regression_custom_float_simd .Release_linear_regression_custom_float_simd.obj /OUT:._linear_regression_custom_float_simd.cp37-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom_float_simd.cp37-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float_simd.cp37-win_amd64.pyd'
from _linear_regression_custom_float_simd.lib import linreg_custom_float_simd
out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )

linreg_custom_float_simd(cptr_x, cptr_out)
out
array([171.1178], dtype=float32)
memo_time.append(timeexe("cffi-linreg-custom-float32-simd", "linreg_custom_float_simd(cptr_x, cptr_out)"))
Moyenne: 515.47 ns Ecart-type 663.25 ns (with 100 runs) in [316.05 ns, 987.65 ns]

C’est légèrement mieux, quelques références :

Les processeurs évoluent au fil du temps, 4 float, 8 float, SIMD2, FMA4 Intrinsics Added for Visual Studio 2010 SP1, AVX.

Réécriture purement Python

On continue avec uniquement du Python sans numpy.

coef = clr.coef_
list(coef)
[0.3034995490660432,
 -237.63931533353403,
 510.5306054362253,
 327.7369804093466,
 -814.1317093725387,
 492.81458798373217,
 102.8484521916802,
 184.60648905984,
 743.519616750542,
 76.09517221662392]
code = str(clr.intercept_) + "+" + "+".join("x[{0}]*({1})".format(i, c) for i, c in enumerate(coef))
code
'152.76430691633442+x[0]*(0.3034995490660432)+x[1]*(-237.63931533353403)+x[2]*(510.5306054362253)+x[3]*(327.7369804093466)+x[4]*(-814.1317093725387)+x[5]*(492.81458798373217)+x[6]*(102.8484521916802)+x[7]*(184.60648905984)+x[8]*(743.519616750542)+x[9]*(76.09517221662392)'
def predict_clr_python(x):
    return 152.764306916+x[0]*0.3034995490664121+x[1]*(-237.63931533353392)+x[2]*510.5306054362245+ \
            x[3]*327.7369804093466+ \
            x[4]*(-814.1317093725389)+x[5]*492.81458798373245+x[6]*102.84845219168025+ \
            x[7]*184.60648905984064+x[8]*743.5196167505419+x[9]*76.095172216624

predict_clr_python(x[0])
197.61846907469848
z = list(x[0])
memo_time.append(timeexe("python-linreg-custom", "predict_clr_python(z)"))
Moyenne: 3.15 µs Ecart-type 1.81 µs (with 100 runs) in [2.00 µs, 6.32 µs]

De façon assez surprenante, c’est plutôt rapide. Et si on y mettait une boucle.

def predict_clr_python_loop(x, coef, intercept):
    return intercept + sum(a*b for a, b in zip(x, coef))

predict_clr_python_loop(x[0], list(clr.coef_), clr.intercept_)
197.61846907503298
coef = list(clr.coef_)
intercept = clr.intercept_
memo_time.append(timeexe("python-linreg", "predict_clr_python_loop(z, coef, intercept)"))
Moyenne: 5.87 µs Ecart-type 3.95 µs (with 100 runs) in [3.48 µs, 10.08 µs]

A peine plus long.

Réécriture avec Python et numpy

def predict_clr_numpy(x, coef, intercept):
    return intercept + numpy.dot(coef, x).sum()

predict_clr_numpy(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numpy-linreg-numpy", "predict_clr_numpy(z, coef, clr.intercept_)"))
Moyenne: 9.97 µs Ecart-type 4.96 µs (with 100 runs) in [7.37 µs, 19.62 µs]

Les dimensions des tableaux sont trop petites pour que le calcul matriciel apporte une différence. On se retrouve dans le cas cffi où les échanges Python - C grignotent tout le temps de calcul.

numba

numba essaye de compiler à la volée des bouts de codes écrits en Python. On induque quelle fonction optimiser en faisant précéder la fonction de @jit. Toutes les écritures ne fonctionnent, typiquement, certaines listes en compréhension soulèvent une exception. Il faut donc écrire son code en Python d’une façon assez proche de ce qu’il serait en C.

from numba import jit
@jit
def predict_clr_numba(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba(z, clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-notype", "predict_clr_numba(z, clr.coef_, clr.intercept_)"))
Moyenne: 32.13 µs Ecart-type 15.74 µs (with 100 runs) in [20.77 µs, 59.50 µs]

Plutôt rapide !

@jit('double(double[:], double[:], double)')
def predict_clr_numba_cast(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-type", "predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 1.30 µs Ecart-type 1.45 µs (with 100 runs) in [821.72 ns, 2.50 µs]

On voit que plus on donne d’information au compilateur, plus il est capable d’optimiser.

@jit('float32(float32[:], float32[:], float32)')
def predict_clr_numba_cast_float(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
x32 = x[0].astype(numpy.float32)
c32 = clr.coef_.astype(numpy.float32)
i32 = numpy.float32(clr.intercept_)
predict_clr_numba_cast_float(x32, c32, i32)
197.61846923828125
memo_time.append(timeexe("numba-linreg-type-float32", "predict_clr_numba_cast_float(x32, c32, i32)"))
Moyenne: 1.17 µs Ecart-type 5.36 µs (with 100 runs) in [580.73 ns, 2.03 µs]

On essaye avec les coefficients dans la fonction.

@jit('double(double[:])')
def predict_clr_numba_cast_custom(x):
    coef = [ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
            -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
             7.43519617e+02,  7.60951722e+01]
    s = 152.76430691633442
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba_cast_custom(x[0])
197.61846907190048
memo_time.append(timeexe("numba-linreg-type-custom", "predict_clr_numba_cast_custom(x[0])"))
Moyenne: 958.94 ns Ecart-type 492.60 ns (with 100 runs) in [687.40 ns, 1.90 µs]

On se rapproche des temps obtenus avec cffi sans wrapping, cela signifie que numba fait un bien meilleur travail à ce niveau que le wrapper rapidement créé.

@jit('double(double[:], double[:], double)')
def predict_clr_numba_numpy(x, coef, intercept):
    return intercept + numpy.dot(coef, x).sum()

predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-type-numpy", "predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 8.31 µs Ecart-type 4.03 µs (with 100 runs) in [5.84 µs, 15.30 µs]

numba est moins performant quand numpy est impliqué car le code de numpy n’est pas réécrit, il est appelé.

cython

cython permet de créer des extensions C de plus grande envergure que numba. C’est l’option choisie par scikit-learn. Il vaut mieux connaître le C pour s’en servir et là encore, l’objectif est de réduire les échanges Python / C qui coûtent cher.

%load_ext cython
%%cython
def predict_clr_cython(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s
predict_clr_cython(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("cython-linreg", "predict_clr_cython(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 3.55 µs Ecart-type 2.19 µs (with 100 runs) in [2.28 µs, 7.82 µs]

Cython fait moins bien que numba dans notre cas et l’optimisation proposée est assez proche du temps déjà obtenue avec le langage Python seul. Cela est dû au fait que la plupart des objets tels que du code associé aux listes ou aux dictionnaires ont été réécrits en C.

%%cython
cimport numpy as npc

def predict_clr_cython_type(npc.ndarray[double, ndim=1, mode='c'] x,
                            npc.ndarray[double, ndim=1, mode='c'] coef,
                            double intercept):
    cdef double s = intercept
    for i in range(0, x.shape[0]):
        s += coef[i] * x[i]
    return s
predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("cython-linreg-type", "predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 1.82 µs Ecart-type 1.10 µs (with 100 runs) in [1.37 µs, 3.66 µs]

Le temps est quasi identique avec un écart type moins grand de façon significative.

Une dernière option : ONNX

ONNX est un format de sérialisation qui permet de décrire un modèle de modèle de machine learning ou de deep learning. Cela permet de dissocer le modèle de la librairie qui a servi à le produire (voir ML.net and ONNX.

try:
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import FloatTensorType
    import onnxruntime
    import onnx
    ok_onnx = True
    print("onnx, skl2onnx, onnxruntime sont disponibles.")

    def save_model(onnx_model, filename):
        with open(filename, "wb") as f:
            f.write(onnx_model.SerializeToString())
except ImportError as e:
    print("La suite requiert onnx, skl2onnx et onnxruntime.")
    print(e)
    ok_onnx = False
onnx, skl2onnx, onnxruntime sont disponibles.

On convertit le modèle au format ONNX.

if ok_onnx:
    onnx_model = convert_sklearn(clr, 'model', [('input', FloatTensorType([1, clr.coef_.shape[0]]))])
    save_model(onnx_model, 'model.onnx')

    model_onnx = onnx.load('model.onnx')
    print("Modèle sérialisé au format ONNX")
    print(model_onnx)
else:
    print("onnx, onnxmltools, onnxruntime sont disponibles.")
The maximum opset needed by this model is only 1.
Modèle sérialisé au format ONNX
ir_version: 4
producer_name: "skl2onnx"
producer_version: "1.4.3"
domain: "ai.onnx"
model_version: 0
doc_string: ""
graph {
  node {
    input: "input"
    output: "variable"
    name: "LinearRegressor"
    op_type: "LinearRegressor"
    attribute {
      name: "coefficients"
      floats: 0.3034995496273041
      floats: -237.63931274414062
      floats: 510.5306091308594
      floats: 327.7369689941406
      floats: -814.1317138671875
      floats: 492.8145751953125
      floats: 102.84844970703125
      floats: 184.6064910888672
      floats: 743.5195922851562
      floats: 76.09516906738281
      type: FLOATS
    }
    attribute {
      name: "intercepts"
      floats: 152.76431274414062
      type: FLOATS
    }
    domain: "ai.onnx.ml"
  }
  name: "model"
  input {
    name: "input"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 10
          }
        }
      }
    }
  }
  output {
    name: "variable"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
}
opset_import {
  domain: "ai.onnx.ml"
  version: 1
}

On calcule les prédictions. Le module {onnxruntime](https://docs.microsoft.com/en-us/python/api/overview/azure/onnx/intro?view=azure-onnx-py) optimise les calculs pour des modèles de deep learning. Cela explique pourquoi tous les calculs sont réalisés avec des réels représentés sur 4 octets numpy.float32.

if ok_onnx:
    sess = onnxruntime.InferenceSession("model.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt(x):
        return sess.run(["variable"], {'input': x})

    print("Prediction:", predict_onnxrt(x.astype(numpy.float32)))
Input: NodeArg(name='input', type='tensor(float)', shape=[1, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[1, 1])
Prediction: [array([[197.61847]], dtype=float32)]
if ok_onnx:
    x32 = x.astype(numpy.float32)
    memo_time.append(timeexe("onnxruntime-float32", "predict_onnxrt(x32)"))
    memo_time.append(timeexe("onnxruntime-float64", "predict_onnxrt(x.astype(numpy.float32))"))
Moyenne: 15.61 µs Ecart-type 6.97 µs (with 100 runs) in [11.03 µs, 28.25 µs]
Moyenne: 16.05 µs Ecart-type 6.55 µs (with 100 runs) in [12.40 µs, 31.08 µs]

Récapitulatif

import pandas
df = pandas.DataFrame(data=memo_time)
df = df.set_index("legend").sort_values("average")
df
average code deviation first first3 last3 max5 min5 repeat run
legend
cffi-linreg-custom 4.009315e-07 linreg_custom(cptr_x, cptr_out) 1.764600e-07 6.242000e-07 4.846133e-07 3.358033e-07 6.558000e-07 3.200000e-07 1000 100
cffi-linreg-custom-float32 4.990487e-07 linreg_custom_float(cptr_x, cptr_out) 2.503353e-07 1.872590e-06 1.696127e-06 3.318500e-07 1.050860e-06 3.318500e-07 1000 100
cffi-linreg-custom-float32-simd 5.154674e-07 linreg_custom_float_simd(cptr_x, cptr_out) 6.632466e-07 1.426170e-06 8.032900e-07 3.186800e-07 9.876500e-07 3.160500e-07 1000 100
cffi-linreg 7.814973e-07 lib.linreg(n, cptr_x, cptr_coef, clr.intercept... 5.413422e-07 2.014810e-06 1.189137e-06 4.635367e-07 1.434070e-06 4.582700e-07 1000 100
numba-linreg-type-custom 9.589350e-07 predict_clr_numba_cast_custom(x[0]) 4.925960e-07 2.378270e-06 1.643453e-06 7.045267e-07 1.904190e-06 6.874000e-07 1000 100
numba-linreg-type-float32 1.172766e-06 predict_clr_numba_cast_float(x32, c32, i32) 5.360108e-06 1.923940e-06 1.834397e-06 5.820567e-07 2.034570e-06 5.807300e-07 1000 100
numba-linreg-type 1.298707e-06 predict_clr_numba_cast(x[0], clr.coef_, clr.in... 1.449341e-06 1.536790e-06 1.609217e-06 8.230467e-07 2.496790e-06 8.217200e-07 1000 100
cython-linreg-type 1.823557e-06 predict_clr_cython_type(x[0], clr.coef_, clr.i... 1.099262e-06 2.686420e-06 2.604770e-06 1.373493e-06 3.662210e-06 1.370860e-06 1000 100
python-linreg-custom 3.145976e-06 predict_clr_python(z) 1.809140e-06 2.050370e-06 2.639007e-06 2.004277e-06 6.317020e-06 1.999000e-06 1000 100
cython-linreg 3.549171e-06 predict_clr_cython(x[0], clr.coef_, clr.interc... 2.192735e-06 2.374320e-06 5.358343e-06 2.312423e-06 7.818250e-06 2.275550e-06 1000 100
cffi-linreg-custom wrapped 5.525170e-06 predict_clr_custom(x) 2.166771e-06 2.072489e-05 1.716407e-05 5.105500e-06 8.505650e-06 4.851350e-06 1000 100
python-linreg 5.870797e-06 predict_clr_python_loop(z, coef, intercept) 3.951088e-06 7.553560e-06 7.607550e-06 8.828290e-06 1.008195e-05 3.476540e-06 1000 100
cffi-linreg-custom-float wrapped 7.205320e-06 predict_clr_custom(x32) 3.586173e-06 1.277626e-05 2.182579e-05 5.596693e-06 1.325824e-05 5.151590e-06 1000 100
cffi-linreg-wrapped 8.209533e-06 predict_clr(x, clr) 3.681237e-06 2.280685e-05 2.301624e-05 6.889860e-06 1.430121e-05 6.862210e-06 1000 100
numba-linreg-type-numpy 8.311518e-06 predict_clr_numba_numpy(x[0], clr.coef_, clr.i... 4.030476e-06 1.464885e-05 1.980176e-05 6.020723e-06 1.530465e-05 5.839000e-06 1000 100
numpy-linreg-numpy 9.970611e-06 predict_clr_numpy(z, coef, clr.intercept_) 4.957268e-06 2.051550e-05 2.859581e-05 7.527227e-06 1.961871e-05 7.367890e-06 1000 100
onnxruntime-float32 1.560984e-05 predict_onnxrt(x32) 6.973393e-06 1.163454e-05 1.123553e-05 1.194664e-05 2.825475e-05 1.102615e-05 1000 100
onnxruntime-float64 1.605105e-05 predict_onnxrt(x.astype(numpy.float32)) 6.550978e-06 1.372046e-05 1.322795e-05 1.264326e-05 3.108338e-05 1.240491e-05 1000 100
numba-linreg-notype 3.212648e-05 predict_clr_numba(z, clr.coef_, clr.intercept_) 1.574220e-05 5.915245e-05 8.600210e-05 3.796139e-05 5.950406e-05 2.077230e-05 1000 100
sklearn.predict 3.723909e-05 clr.predict(z) 1.834971e-05 8.333412e-05 8.313264e-05 3.150083e-05 5.907344e-05 3.090165e-05 1000 100

On enlève quelques colonnes et on rappelle :

  • cffi: signifie optimisé avec cffi

  • custom: pas de boucle mais la fonction ne peut prédire qu’une seule régression linéaire

  • float32: utilise des float et non des double

  • linreg: régression linéaire

  • numba: optimisation avec numba

  • numpy: optimisation avec numpy

  • python: pas de C, que du python

  • simd: optimisé avec les instructions SIMD

  • sklearn: fonction sklearn.predict

  • static: la fonction utilise des variables statiques

  • type: la fonction est typée et ne fonctionne qu’avec un type précis en entrée.

  • wrapped: code optimisé mais embabllé dans une fonction Python qui elle ne l’est pas (les containers sont recréés à chaque fois)

cols = ["average", "deviation", "min5", "max5", "run", "code"]
df[cols]
average deviation min5 max5 run code
legend
cffi-linreg-custom 4.009315e-07 1.764600e-07 3.200000e-07 6.558000e-07 100 linreg_custom(cptr_x, cptr_out)
cffi-linreg-custom-float32 4.990487e-07 2.503353e-07 3.318500e-07 1.050860e-06 100 linreg_custom_float(cptr_x, cptr_out)
cffi-linreg-custom-float32-simd 5.154674e-07 6.632466e-07 3.160500e-07 9.876500e-07 100 linreg_custom_float_simd(cptr_x, cptr_out)
cffi-linreg 7.814973e-07 5.413422e-07 4.582700e-07 1.434070e-06 100 lib.linreg(n, cptr_x, cptr_coef, clr.intercept...
numba-linreg-type-custom 9.589350e-07 4.925960e-07 6.874000e-07 1.904190e-06 100 predict_clr_numba_cast_custom(x[0])
numba-linreg-type-float32 1.172766e-06 5.360108e-06 5.807300e-07 2.034570e-06 100 predict_clr_numba_cast_float(x32, c32, i32)
numba-linreg-type 1.298707e-06 1.449341e-06 8.217200e-07 2.496790e-06 100 predict_clr_numba_cast(x[0], clr.coef_, clr.in...
cython-linreg-type 1.823557e-06 1.099262e-06 1.370860e-06 3.662210e-06 100 predict_clr_cython_type(x[0], clr.coef_, clr.i...
python-linreg-custom 3.145976e-06 1.809140e-06 1.999000e-06 6.317020e-06 100 predict_clr_python(z)
cython-linreg 3.549171e-06 2.192735e-06 2.275550e-06 7.818250e-06 100 predict_clr_cython(x[0], clr.coef_, clr.interc...
cffi-linreg-custom wrapped 5.525170e-06 2.166771e-06 4.851350e-06 8.505650e-06 100 predict_clr_custom(x)
python-linreg 5.870797e-06 3.951088e-06 3.476540e-06 1.008195e-05 100 predict_clr_python_loop(z, coef, intercept)
cffi-linreg-custom-float wrapped 7.205320e-06 3.586173e-06 5.151590e-06 1.325824e-05 100 predict_clr_custom(x32)
cffi-linreg-wrapped 8.209533e-06 3.681237e-06 6.862210e-06 1.430121e-05 100 predict_clr(x, clr)
numba-linreg-type-numpy 8.311518e-06 4.030476e-06 5.839000e-06 1.530465e-05 100 predict_clr_numba_numpy(x[0], clr.coef_, clr.i...
numpy-linreg-numpy 9.970611e-06 4.957268e-06 7.367890e-06 1.961871e-05 100 predict_clr_numpy(z, coef, clr.intercept_)
onnxruntime-float32 1.560984e-05 6.973393e-06 1.102615e-05 2.825475e-05 100 predict_onnxrt(x32)
onnxruntime-float64 1.605105e-05 6.550978e-06 1.240491e-05 3.108338e-05 100 predict_onnxrt(x.astype(numpy.float32))
numba-linreg-notype 3.212648e-05 1.574220e-05 2.077230e-05 5.950406e-05 100 predict_clr_numba(z, clr.coef_, clr.intercept_)
sklearn.predict 3.723909e-05 1.834971e-05 3.090165e-05 5.907344e-05 100 clr.predict(z)
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(14,6))
df[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                  legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
../_images/cffi_linear_regression_109_0.png

Il manque à ce comparatif le GPU mais c’est un peu plus complexe à mettre en oeuvre, il faut une carte GPU et la parallélisation n’apporterait pas énormément compte tenu de la faible dimension du problème.

Prédiction one-off et biais de mesure

Le graphique précédent montre que la fonction predict de scikit-learn est la plus lente. La première raison est que ce code est valable pour toutes les régresssions linéaires alors que toutes les autres fonctions sont spécialisées pour un seul modèle. La seconde raison est que le code de scikit-learn est optimisé pour le calcul de plusieurs prédictions à la fois alors que toutes les autres fonctions n’en calcule qu’une seule (scénario dit one-off). On compare à ce que donnerait unev version purement python et numpy.

def predict_clr_python_loop_multi(x, coef, intercept):
    # On s'attend à deux dimension.
    res = numpy.zeros((x.shape[0], 1))
    res[:, 0] = intercept
    for i in range(0, x.shape[0]):
        res[i, 0] += sum(a*b for a, b in zip(x[i, :], coef))
    return res

predict_clr_python_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[197.61846908],
       [155.43979328]])
def predict_clr_numpy_loop_multi(x, coef, intercept):
    # On s'attend à deux dimension.
    res = numpy.ones((x.shape[0], 1)) * intercept
    res += x @ coef.reshape((len(coef), 1))
    return res

predict_clr_numpy_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[197.61846908],
       [155.43979328]])
def predict_clr_numba_cast_multi(X, coef, intercept):
    return [predict_clr_numba_cast(x, coef, intercept) for x in X]

predict_clr_numba_cast_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[197.61846907503298, 155.43979327521237]
def predict_clr_cython_type_multi(X, coef, intercept):
    return [predict_clr_cython_type(x, coef, intercept) for x in X]

predict_clr_cython_type_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[197.61846907503298, 155.43979327521237]
memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
for i in batch:
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]

    print("batch", "=", i)
    repeat=20 if i >= 5000 else 100

    memo.append(timeexe("sklearn.predict %d" % i, "clr.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"

    if i <= 1000:
        # très lent
        memo.append(timeexe("python %d" % i, "predict_clr_python_loop_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=20, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "python"

    memo.append(timeexe("numpy %d" % i, "predict_clr_numpy_loop_multi(mx, clr.coef_, clr.intercept_)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "numpy"

    if i <= 10000:
        # très lent
        memo.append(timeexe("numba %d" % i, "predict_clr_numba_cast_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "numba"

    if i <= 1000:
        # très lent
        memo.append(timeexe("cython %d" % i, "predict_clr_cython_type_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "cython"

    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"
batch = 1
Moyenne: 61.72 µs Ecart-type 20.51 µs (with 10 runs) in [36.70 µs, 100.19 µs]
Moyenne: 12.42 µs Ecart-type 6.08 µs (with 10 runs) in [9.92 µs, 36.42 µs]
Moyenne: 8.15 µs Ecart-type 1.73 µs (with 10 runs) in [7.47 µs, 9.17 µs]
Moyenne: 2.38 µs Ecart-type 508.42 ns (with 10 runs) in [2.21 µs, 3.16 µs]
Moyenne: 3.34 µs Ecart-type 2.32 µs (with 10 runs) in [2.65 µs, 6.76 µs]
Moyenne: 20.95 µs Ecart-type 10.21 µs (with 10 runs) in [14.85 µs, 40.61 µs]
batch = 10
Moyenne: 51.02 µs Ecart-type 19.16 µs (with 10 runs) in [36.46 µs, 93.71 µs]
Moyenne: 131.17 µs Ecart-type 64.49 µs (with 10 runs) in [80.24 µs, 261.33 µs]
Moyenne: 7.72 µs Ecart-type 653.54 ns (with 10 runs) in [7.59 µs, 7.74 µs]
Moyenne: 10.04 µs Ecart-type 5.02 µs (with 10 runs) in [9.01 µs, 15.13 µs]
Moyenne: 19.72 µs Ecart-type 9.62 µs (with 10 runs) in [13.95 µs, 36.23 µs]
Moyenne: 36.17 µs Ecart-type 6.33 µs (with 10 runs) in [31.92 µs, 42.75 µs]
batch = 100
Moyenne: 74.33 µs Ecart-type 43.73 µs (with 10 runs) in [35.67 µs, 152.30 µs]
Moyenne: 1.21 ms Ecart-type 478.46 µs (with 10 runs) in [782.46 µs, 2.28 ms]
Moyenne: 13.86 µs Ecart-type 9.37 µs (with 10 runs) in [8.18 µs, 34.17 µs]
Moyenne: 91.47 µs Ecart-type 39.33 µs (with 10 runs) in [75.06 µs, 205.27 µs]
Moyenne: 166.69 µs Ecart-type 74.59 µs (with 10 runs) in [126.10 µs, 332.92 µs]
Moyenne: 41.07 µs Ecart-type 15.02 µs (with 10 runs) in [26.03 µs, 69.77 µs]
batch = 200
Moyenne: 67.71 µs Ecart-type 42.18 µs (with 10 runs) in [36.15 µs, 126.30 µs]
Moyenne: 2.53 ms Ecart-type 734.36 µs (with 10 runs) in [1.65 ms, 4.07 ms]
Moyenne: 16.80 µs Ecart-type 6.97 µs (with 10 runs) in [9.05 µs, 24.02 µs]
Moyenne: 182.37 µs Ecart-type 73.63 µs (with 10 runs) in [149.81 µs, 416.43 µs]
Moyenne: 448.78 µs Ecart-type 206.74 µs (with 10 runs) in [252.88 µs, 841.91 µs]
Moyenne: 47.45 µs Ecart-type 22.24 µs (with 10 runs) in [32.47 µs, 82.41 µs]
batch = 500
Moyenne: 44.61 µs Ecart-type 13.69 µs (with 10 runs) in [39.59 µs, 90.75 µs]
Moyenne: 5.09 ms Ecart-type 923.90 µs (with 10 runs) in [3.97 ms, 7.15 ms]
Moyenne: 10.60 µs Ecart-type 1.09 µs (with 10 runs) in [10.39 µs, 10.51 µs]
Moyenne: 442.09 µs Ecart-type 128.29 µs (with 10 runs) in [355.32 µs, 823.74 µs]
Moyenne: 777.43 µs Ecart-type 208.18 µs (with 10 runs) in [607.56 µs, 1.27 ms]
Moyenne: 96.23 µs Ecart-type 50.76 µs (with 10 runs) in [60.33 µs, 219.46 µs]
batch = 1000
Moyenne: 67.87 µs Ecart-type 56.17 µs (with 10 runs) in [44.05 µs, 222.46 µs]
Moyenne: 10.01 ms Ecart-type 1.75 ms (with 10 runs) in [7.98 ms, 14.46 ms]
Moyenne: 17.70 µs Ecart-type 10.98 µs (with 10 runs) in [13.59 µs, 34.69 µs]
Moyenne: 861.28 µs Ecart-type 216.40 µs (with 10 runs) in [721.14 µs, 1.42 ms]
Moyenne: 1.25 ms Ecart-type 83.58 µs (with 10 runs) in [1.21 ms, 1.46 ms]
Moyenne: 112.48 µs Ecart-type 19.49 µs (with 10 runs) in [105.96 µs, 147.99 µs]
batch = 2000
Moyenne: 66.24 µs Ecart-type 13.14 µs (with 10 runs) in [53.02 µs, 85.93 µs]
Moyenne: 16.39 µs Ecart-type 570.74 ns (with 10 runs) in [16.16 µs, 17.03 µs]
Moyenne: 1.48 ms Ecart-type 91.96 µs (with 10 runs) in [1.42 ms, 1.75 ms]
Moyenne: 204.37 µs Ecart-type 12.70 µs (with 10 runs) in [197.41 µs, 237.55 µs]
batch = 3000
Moyenne: 67.97 µs Ecart-type 19.24 µs (with 10 runs) in [58.67 µs, 101.97 µs]
Moyenne: 22.16 µs Ecart-type 5.87 µs (with 10 runs) in [19.52 µs, 29.83 µs]
Moyenne: 2.26 ms Ecart-type 157.83 µs (with 10 runs) in [2.16 ms, 2.67 ms]
Moyenne: 303.46 µs Ecart-type 38.51 µs (with 10 runs) in [288.95 µs, 395.06 µs]
batch = 4000
Moyenne: 76.48 µs Ecart-type 17.38 µs (with 10 runs) in [64.83 µs, 88.18 µs]
Moyenne: 23.36 µs Ecart-type 5.57 µs (with 10 runs) in [22.24 µs, 25.17 µs]
Moyenne: 2.92 ms Ecart-type 152.70 µs (with 10 runs) in [2.85 ms, 3.20 ms]
Moyenne: 396.96 µs Ecart-type 46.83 µs (with 10 runs) in [380.32 µs, 429.04 µs]
batch = 5000
Moyenne: 90.18 µs Ecart-type 6.40 µs (with 10 runs) in [89.05 µs, 111.25 µs]
Moyenne: 24.74 µs Ecart-type 1.37 µs (with 10 runs) in [24.38 µs, 30.70 µs]
Moyenne: 3.81 ms Ecart-type 294.09 µs (with 10 runs) in [3.62 ms, 4.88 ms]
Moyenne: 483.24 µs Ecart-type 20.95 µs (with 10 runs) in [471.78 µs, 569.48 µs]
batch = 10000
Moyenne: 103.63 µs Ecart-type 6.08 µs (with 10 runs) in [101.02 µs, 127.53 µs]
Moyenne: 39.31 µs Ecart-type 3.28 µs (with 10 runs) in [37.10 µs, 49.98 µs]
Moyenne: 7.50 ms Ecart-type 461.95 µs (with 10 runs) in [7.19 ms, 8.81 ms]
Moyenne: 1.24 ms Ecart-type 319.43 µs (with 10 runs) in [967.50 µs, 1.82 ms]
batch = 20000
Moyenne: 196.19 µs Ecart-type 42.92 µs (with 10 runs) in [170.47 µs, 354.37 µs]
Moyenne: 70.45 µs Ecart-type 11.47 µs (with 10 runs) in [62.46 µs, 103.39 µs]
Moyenne: 2.08 ms Ecart-type 244.46 µs (with 10 runs) in [1.88 ms, 2.60 ms]
batch = 50000
Moyenne: 944.13 µs Ecart-type 176.43 µs (with 10 runs) in [673.07 µs, 1.28 ms]
Moyenne: 323.42 µs Ecart-type 60.33 µs (with 10 runs) in [283.26 µs, 487.47 µs]
Moyenne: 7.10 ms Ecart-type 691.55 µs (with 10 runs) in [6.69 ms, 9.75 ms]
batch = 75000
Moyenne: 1.28 ms Ecart-type 170.95 µs (with 10 runs) in [1.10 ms, 1.88 ms]
Moyenne: 337.19 µs Ecart-type 35.55 µs (with 10 runs) in [302.46 µs, 398.42 µs]
Moyenne: 10.44 ms Ecart-type 649.35 µs (with 10 runs) in [10.10 ms, 12.47 ms]
batch = 100000
Moyenne: 1.75 ms Ecart-type 199.38 µs (with 10 runs) in [1.56 ms, 2.31 ms]
Moyenne: 487.81 µs Ecart-type 49.89 µs (with 10 runs) in [426.90 µs, 566.83 µs]
Moyenne: 13.82 ms Ecart-type 307.45 µs (with 10 runs) in [13.60 ms, 14.58 ms]
batch = 150000
Moyenne: 3.92 ms Ecart-type 285.28 µs (with 10 runs) in [3.72 ms, 4.99 ms]
Moyenne: 2.69 ms Ecart-type 106.66 µs (with 10 runs) in [2.54 ms, 2.95 ms]
Moyenne: 20.91 ms Ecart-type 750.62 µs (with 10 runs) in [20.53 ms, 23.89 ms]
batch = 200000
Moyenne: 5.33 ms Ecart-type 228.93 µs (with 10 runs) in [5.04 ms, 6.02 ms]
Moyenne: 3.58 ms Ecart-type 140.76 µs (with 10 runs) in [3.35 ms, 3.89 ms]
Moyenne: 28.24 ms Ecart-type 1.46 ms (with 10 runs) in [27.45 ms, 33.64 ms]
batch = 300000
Moyenne: 9.47 ms Ecart-type 1.21 ms (with 10 runs) in [8.04 ms, 11.99 ms]
Moyenne: 6.58 ms Ecart-type 923.96 µs (with 10 runs) in [5.45 ms, 8.43 ms]
Moyenne: 44.73 ms Ecart-type 2.43 ms (with 10 runs) in [42.35 ms, 50.52 ms]
batch = 400000
Moyenne: 11.12 ms Ecart-type 751.22 µs (with 10 runs) in [10.08 ms, 12.47 ms]
Moyenne: 7.29 ms Ecart-type 406.01 µs (with 10 runs) in [6.81 ms, 8.54 ms]
Moyenne: 58.54 ms Ecart-type 2.16 ms (with 10 runs) in [56.73 ms, 64.76 ms]
batch = 500000
Moyenne: 13.46 ms Ecart-type 799.10 µs (with 10 runs) in [12.65 ms, 15.37 ms]
Moyenne: 9.22 ms Ecart-type 720.11 µs (with 10 runs) in [8.68 ms, 11.41 ms]
Moyenne: 72.30 ms Ecart-type 1.37 ms (with 10 runs) in [70.96 ms, 74.87 ms]
batch = 600000
Moyenne: 15.86 ms Ecart-type 583.58 µs (with 10 runs) in [15.16 ms, 17.41 ms]
Moyenne: 10.88 ms Ecart-type 380.11 µs (with 10 runs) in [10.29 ms, 11.95 ms]
Moyenne: 87.84 ms Ecart-type 3.57 ms (with 10 runs) in [84.77 ms, 97.15 ms]
dfb = pandas.DataFrame(memo)[["average", "lib", "batch"]]
piv = dfb.pivot("batch", "lib", "average")
piv
lib cython numba numpy onnxruntime python sklearn
batch
1 0.000003 0.000002 0.000008 0.000021 0.000012 0.000062
10 0.000020 0.000010 0.000008 0.000036 0.000131 0.000051
100 0.000167 0.000091 0.000014 0.000041 0.001212 0.000074
200 0.000449 0.000182 0.000017 0.000047 0.002532 0.000068
500 0.000777 0.000442 0.000011 0.000096 0.005090 0.000045
1000 0.001254 0.000861 0.000018 0.000112 0.010005 0.000068
2000 NaN 0.001475 0.000016 0.000204 NaN 0.000066
3000 NaN 0.002255 0.000022 0.000303 NaN 0.000068
4000 NaN 0.002925 0.000023 0.000397 NaN 0.000076
5000 NaN 0.003806 0.000025 0.000483 NaN 0.000090
10000 NaN 0.007497 0.000039 0.001244 NaN 0.000104
20000 NaN NaN 0.000070 0.002081 NaN 0.000196
50000 NaN NaN 0.000323 0.007097 NaN 0.000944
75000 NaN NaN 0.000337 0.010440 NaN 0.001285
100000 NaN NaN 0.000488 0.013821 NaN 0.001754
150000 NaN NaN 0.002686 0.020909 NaN 0.003922
200000 NaN NaN 0.003575 0.028242 NaN 0.005326
300000 NaN NaN 0.006576 0.044728 NaN 0.009469
400000 NaN NaN 0.007288 0.058541 NaN 0.011123
500000 NaN NaN 0.009223 0.072303 NaN 0.013464
600000 NaN NaN 0.010878 0.087836 NaN 0.015860
for c in piv.columns:
    piv["ave_" + c] = piv[c] / piv.index
piv
lib cython numba numpy onnxruntime python sklearn ave_cython ave_numba ave_numpy ave_onnxruntime ave_python ave_sklearn
batch
1 0.000003 0.000002 0.000008 0.000021 0.000012 0.000062 0.000003 2.378661e-06 8.147732e-06 2.094534e-05 0.000012 6.172035e-05
10 0.000020 0.000010 0.000008 0.000036 0.000131 0.000051 0.000002 1.004442e-06 7.720681e-07 3.617296e-06 0.000013 5.102210e-06
100 0.000167 0.000091 0.000014 0.000041 0.001212 0.000074 0.000002 9.147396e-07 1.386308e-07 4.106933e-07 0.000012 7.433464e-07
200 0.000449 0.000182 0.000017 0.000047 0.002532 0.000068 0.000002 9.118438e-07 8.401954e-08 2.372320e-07 0.000013 3.385394e-07
500 0.000777 0.000442 0.000011 0.000096 0.005090 0.000045 0.000002 8.841777e-07 2.119264e-08 1.924594e-07 0.000010 8.921342e-08
1000 0.001254 0.000861 0.000018 0.000112 0.010005 0.000068 0.000001 8.612776e-07 1.769872e-08 1.124766e-07 0.000010 6.787302e-08
2000 NaN 0.001475 0.000016 0.000204 NaN 0.000066 NaN 7.375451e-07 8.196917e-09 1.021833e-07 NaN 3.312071e-08
3000 NaN 0.002255 0.000022 0.000303 NaN 0.000068 NaN 7.517826e-07 7.386717e-09 1.011543e-07 NaN 2.265766e-08
4000 NaN 0.002925 0.000023 0.000397 NaN 0.000076 NaN 7.312328e-07 5.838998e-09 9.924105e-08 NaN 1.912084e-08
5000 NaN 0.003806 0.000025 0.000483 NaN 0.000090 NaN 7.612837e-07 4.947741e-09 9.664728e-08 NaN 1.803571e-08
10000 NaN 0.007497 0.000039 0.001244 NaN 0.000104 NaN 7.497284e-07 3.931249e-09 1.244355e-07 NaN 1.036304e-08
20000 NaN NaN 0.000070 0.002081 NaN 0.000196 NaN NaN 3.522461e-09 1.040586e-07 NaN 9.809558e-09
50000 NaN NaN 0.000323 0.007097 NaN 0.000944 NaN NaN 6.468370e-09 1.419331e-07 NaN 1.888264e-08
75000 NaN NaN 0.000337 0.010440 NaN 0.001285 NaN NaN 4.495871e-09 1.391996e-07 NaN 1.713105e-08
100000 NaN NaN 0.000488 0.013821 NaN 0.001754 NaN NaN 4.878072e-09 1.382130e-07 NaN 1.753515e-08
150000 NaN NaN 0.002686 0.020909 NaN 0.003922 NaN NaN 1.790808e-08 1.393904e-07 NaN 2.614948e-08
200000 NaN NaN 0.003575 0.028242 NaN 0.005326 NaN NaN 1.787713e-08 1.412113e-07 NaN 2.663233e-08
300000 NaN NaN 0.006576 0.044728 NaN 0.009469 NaN NaN 2.191992e-08 1.490931e-07 NaN 3.156392e-08
400000 NaN NaN 0.007288 0.058541 NaN 0.011123 NaN NaN 1.821957e-08 1.463514e-07 NaN 2.780641e-08
500000 NaN NaN 0.009223 0.072303 NaN 0.013464 NaN NaN 1.844603e-08 1.446063e-07 NaN 2.692782e-08
600000 NaN NaN 0.010878 0.087836 NaN 0.015860 NaN NaN 1.812999e-08 1.463938e-07 NaN 2.643390e-08
libs = list(c for c in piv.columns if "ave_" in c)
ax = piv.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch")
ax.grid(True);
../_images/cffi_linear_regression_119_0.png

Le minimum obtenu est pour 10^{-8} s soit 10 ns. Cela montre que la comparaisson précédente était incomplète voire biaisée. Tout dépend de l’usage qu’on fait de la fonction de prédiction même s’il sera toujours possible de d’écrire un code spécialisé plus rapide que toute autre fonction générique. En général, plus on reste du côté Python, plus le programme est lent. Le nombre de passage de l’un à l’autre, selon la façon dont il est fait ralenti aussi. En tenant compte de cela, le programme rouge sera plus lent que le vert.

from pyquickhelper.helpgen import NbImage
NbImage("pycpp.png")
../_images/cffi_linear_regression_121_0.png

Ces résultats sont d’une façon générale assez volatile car le temps de calcul est enrobé dans plusieurs fonctions Python qui rendent une mesure précise difficile. Il reste néanmoins une bonne idée des ordres de grandeurs.

Random Forest

On reproduit les mêmes résultats pour une random forest mais la réécriture n’est plus aussi simple qu’une régression linéaire.

Une prédiction à la fois

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test  = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test  = diabetes.target[-20:]
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10)
rf.fit(diabetes_X_train, diabetes_y_train)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
memo_time = []
x = diabetes_X_test[:1]
memo_time.append(timeexe("sklearn-rf", "rf.predict(x)", repeat=100, number=20))
Moyenne: 598.88 µs Ecart-type 55.79 µs (with 20 runs) in [546.51 µs, 736.22 µs]

C’est beaucoup plus long que la régression linéaire. On essaye avec onnx.

if ok_onnx:
    onnxrf_model = convert_sklearn(rf, 'model', [('input', FloatTensorType([1, clr.coef_.shape[0]]))])
    save_model(onnxrf_model, 'model_rf.onnx')
    model_onnx = onnx.load('model_rf.onnx')
The maximum opset needed by this model is only 1.
if ok_onnx:
    sess = onnxruntime.InferenceSession("model_rf.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt_rf(x):
        return sess.run(["variable"], {'input': x})

    print(predict_onnxrt_rf(x.astype(numpy.float32)))
    memo_time.append(timeexe("onnx-rf", "predict_onnxrt_rf(x.astype(numpy.float32))", repeat=100, number=20))
Input: NodeArg(name='input', type='tensor(float)', shape=[1, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[1, 1])
[array([[172.3]], dtype=float32)]
Moyenne: 28.42 µs Ecart-type 7.85 µs (with 20 runs) in [18.47 µs, 47.33 µs]

C’est beaucoup plus rapide.

import pandas
df2 = pandas.DataFrame(data=memo_time)
df2 = df2.set_index("legend").sort_values("average")
df2
average code deviation first first3 last3 max5 min5 repeat run
legend
onnx-rf 0.000028 predict_onnxrt_rf(x.astype(numpy.float32)) 0.000008 0.000029 0.000027 0.000025 0.000047 0.000018 100 20
sklearn-rf 0.000599 rf.predict(x) 0.000056 0.000910 0.000801 0.000601 0.000736 0.000547 100 20
fig, ax = plt.subplots(1, 1, figsize=(14,4))
df2[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                   legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
../_images/cffi_linear_regression_133_0.png

Prédiction en batch

memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
repeat = 10
for i in batch[:15]:
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]

    print("batch", "=", i)

    memo.append(timeexe("sklearn.predict %d" % i, "rf.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"

    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt_rf(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"
batch = 1
Moyenne: 674.08 µs Ecart-type 216.17 µs (with 10 runs) in [535.19 µs, 1.25 ms]
Moyenne: 23.35 µs Ecart-type 7.29 µs (with 10 runs) in [18.57 µs, 44.44 µs]
batch = 10
Moyenne: 603.57 µs Ecart-type 46.82 µs (with 10 runs) in [554.90 µs, 720.59 µs]
Moyenne: 48.79 µs Ecart-type 10.54 µs (with 10 runs) in [38.44 µs, 72.73 µs]
batch = 100
Moyenne: 720.38 µs Ecart-type 85.23 µs (with 10 runs) in [625.93 µs, 878.73 µs]
Moyenne: 277.19 µs Ecart-type 61.05 µs (with 10 runs) in [234.94 µs, 434.57 µs]
batch = 200
Moyenne: 754.27 µs Ecart-type 89.03 µs (with 10 runs) in [666.03 µs, 1.01 ms]
Moyenne: 469.01 µs Ecart-type 24.81 µs (with 10 runs) in [428.60 µs, 506.47 µs]
batch = 500
Moyenne: 937.39 µs Ecart-type 119.09 µs (with 10 runs) in [817.82 µs, 1.14 ms]
Moyenne: 1.14 ms Ecart-type 108.41 µs (with 10 runs) in [1.03 ms, 1.39 ms]
batch = 1000
Moyenne: 1.14 ms Ecart-type 103.66 µs (with 10 runs) in [1.01 ms, 1.38 ms]
Moyenne: 1.98 ms Ecart-type 60.76 µs (with 10 runs) in [1.89 ms, 2.12 ms]
batch = 2000
Moyenne: 1.49 ms Ecart-type 156.51 µs (with 10 runs) in [1.40 ms, 1.95 ms]
Moyenne: 3.99 ms Ecart-type 207.44 µs (with 10 runs) in [3.76 ms, 4.41 ms]
batch = 3000
Moyenne: 1.81 ms Ecart-type 112.98 µs (with 10 runs) in [1.69 ms, 2.01 ms]
Moyenne: 5.92 ms Ecart-type 213.19 µs (with 10 runs) in [5.61 ms, 6.42 ms]
batch = 4000
Moyenne: 2.17 ms Ecart-type 137.53 µs (with 10 runs) in [2.01 ms, 2.46 ms]
Moyenne: 7.76 ms Ecart-type 389.39 µs (with 10 runs) in [7.36 ms, 8.73 ms]
batch = 5000
Moyenne: 2.61 ms Ecart-type 253.06 µs (with 10 runs) in [2.37 ms, 3.05 ms]
Moyenne: 9.85 ms Ecart-type 415.63 µs (with 10 runs) in [9.49 ms, 10.63 ms]
batch = 10000
Moyenne: 4.47 ms Ecart-type 337.08 µs (with 10 runs) in [4.20 ms, 5.31 ms]
Moyenne: 20.77 ms Ecart-type 1.84 ms (with 10 runs) in [19.49 ms, 26.13 ms]
batch = 20000
Moyenne: 9.72 ms Ecart-type 896.79 µs (with 10 runs) in [8.17 ms, 11.10 ms]
Moyenne: 40.78 ms Ecart-type 4.39 ms (with 10 runs) in [36.75 ms, 47.99 ms]
batch = 50000
Moyenne: 19.34 ms Ecart-type 198.30 µs (with 10 runs) in [19.15 ms, 19.83 ms]
Moyenne: 94.49 ms Ecart-type 402.90 µs (with 10 runs) in [93.97 ms, 95.18 ms]
batch = 75000
Moyenne: 29.40 ms Ecart-type 1.49 ms (with 10 runs) in [28.36 ms, 32.51 ms]
Moyenne: 148.87 ms Ecart-type 9.32 ms (with 10 runs) in [141.62 ms, 166.68 ms]
batch = 100000
Moyenne: 39.84 ms Ecart-type 2.04 ms (with 10 runs) in [37.37 ms, 42.91 ms]
Moyenne: 305.83 ms Ecart-type 175.02 ms (with 10 runs) in [191.99 ms, 709.25 ms]
dfbrf = pandas.DataFrame(memo)[["average", "lib", "batch"]]
pivrf = dfbrf.pivot("batch", "lib", "average")
for c in pivrf.columns:
    pivrf["ave_" + c] = pivrf[c] / pivrf.index
libs = list(c for c in pivrf.columns if "ave_" in c)
ax = pivrf.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch\nrandom forest")
ax.grid(True);
../_images/cffi_linear_regression_136_0.png