Optimisation de code avec cffi, numba, cython

Links: notebook, html, PDF, python, slides, GitHub

L’idée est de recoder une fonction en C. On prend comme exemple la fonction de prédiction de la régression linéaire de scikit-learn et de prévoir le gain de temps qu’on obtient en recodant la fonction dans un langage plus rapide.

from jyquickhelper import add_notebook_menu
add_notebook_menu()
memo_time = []
import timeit

def unit(x):
    if x >= 1: return "%1.2f s" % x
    elif x >= 1e-3: return "%1.2f ms" % (x* 1000)
    elif x >= 1e-6: return "%1.2f µs" % (x* 1000**2)
    elif x >= 1e-9: return "%1.2f ns" % (x* 1000**3)
    else:
        return "%1.2g s" % x

def timeexe(legend, code, number=100, repeat=1000):
    rep = timeit.repeat(code, number=number, repeat=repeat, globals=globals())
    ave = sum(rep) / (number * repeat)
    std = (sum((x/number - ave)**2 for x in rep) / repeat)**0.5
    fir = rep[0]/number
    fir3 = sum(rep[:3]) / (3 * number)
    las3 = sum(rep[-3:]) / (3 * number)
    rep.sort()
    mini = rep[len(rep)//20] / number
    maxi = rep[-len(rep)//20] / number
    print("Moyenne: %s Ecart-type %s (with %d runs) in [%s, %s]" % (
                unit(ave), unit(std), number, unit(mini), unit(maxi)))
    return dict(legend=legend, average=ave, deviation=std, first=fir, first3=fir3,
                last3=las3, repeat=repeat, min5=mini, max5=maxi, code=code, run=number)

Régression linéaire

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test  = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test  = diabetes.target[-20:]
from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(diabetes_X_train, diabetes_y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
clr.coef_
array([ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
       -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
        7.43519617e+02,  7.60951722e+01])
clr.intercept_
152.76430691633442
z = diabetes_X_test[0:1,:]
memo_time.append(timeexe("sklearn.predict", "clr.predict(z)"))
Moyenne: 48.88 µs Ecart-type 12.74 µs (with 100 runs) in [41.40 µs, 79.02 µs]
%timeit clr.predict(z)
45.6 µs ± 1.21 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

optimisation avec cffi

On s’inspire de l’exemple Purely for performance (API level, out-of-line).

from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg(int, double *, double *, double, double *);")

ffibuilder.set_source("_linear_regression",
r"""
    static int linreg(int dimension, double * x, double *coef, double intercept, double * out)
    {
        for(; dimension > 0; --dimension, ++x, ++coef)
            intercept += *x * *coef;
        *out = intercept;
        return 1;
    }
""")

ffibuilder.compile(verbose=True)
generating ._linear_regression.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression' extension
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Ic:python372_x64include -Ic:python372_x64include "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFCinclude" "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023include" "-IC:Program Files (x86)Windows KitsNETFXSDK4.6.1includeum" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0ucrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0shared" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0um" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0winrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0cppwinrt" /Tc_linear_regression.c /Fo.Release_linear_regression.obj
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:c:python372_x64libs /LIBPATH:c:python372_x64PCbuildamd64 "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFClibx64" "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023libx64" "/LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.6.1libumx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0ucrtx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0umx64" /EXPORT:PyInit__linear_regression .Release_linear_regression.obj /OUT:._linear_regression.cp37-win_amd64.pyd /IMPLIB:.Release_linear_regression.cp37-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression.cp37-win_amd64.pyd'

La fonction compilée est accessible comme suit.

from _linear_regression import ffi, lib
lib.linreg
<function _linear_regression.CompiledLib.linreg>

On s’inspire de l’exemple How to pass a Numpy array into a cffi function and how to get one back out?.

import numpy
out = numpy.zeros(1)
ptr_coef = clr.coef_.__array_interface__['data'][0]
cptr_coef = ffi.cast ( "double*" , ptr_coef )
x = diabetes_X_test[0:1,:]
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast ( "double*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "double*" , ptr_out )
n = len(clr.coef_)
lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)
1
out
array([197.61846908])

On vérifie qu’on obtient bien la même chose.

clr.predict(x)
array([197.61846908])

Et on mesure le temps d’exécution :

memo_time.append(timeexe("cffi-linreg", "lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)"))
Moyenne: 985.29 ns Ecart-type 1.39 µs (with 100 runs) in [499.00 ns, 1.65 µs]

C’est beaucoup plus rapide. Pour être totalement honnête, il faut mesurer les étapes qui consiste à extraire les pointeurs.

def predict_clr(x, clr):
    out = numpy.zeros(1)
    ptr_coef = clr.coef_.__array_interface__['data'][0]
    cptr_coef = ffi.cast ( "double*" , ptr_coef )
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast ( "double*" , ptr_x )
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast ( "double*" , ptr_out )
    lib.linreg(len(x), cptr_x, cptr_coef, clr.intercept_, cptr_out)
    return out

predict_clr(x, clr)
array([152.74058378])
memo_time.append(timeexe("cffi-linreg-wrapped", "predict_clr(x, clr)"))
Moyenne: 8.50 µs Ecart-type 2.59 µs (with 100 runs) in [6.93 µs, 13.60 µs]

Cela reste plus rapide.

cffi - seconde version

Comme on construit la fonction en dynamique (le code est connu lors de l’exécution), on peut facilement se passer de la boucle et écrire le code sans boucle et avec les coefficients.

res = " + ".join("{0}*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'0.3034995490660432*x[0] + -237.63931533353403*x[1] + 510.5306054362253*x[2] + 327.7369804093466*x[3] + -814.1317093725387*x[4] + 492.81458798373217*x[5] + 102.8484521916802*x[6] + 184.60648905984*x[7] + 743.519616750542*x[8] + 76.09517221662392*x[9]'
code = """
    static int linreg_custom(double * x, double * out)
    {{
        out[0] = {0} + {1};
    }}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom(double * x, double * out)
{
    out[0] = 152.76430691633442 + 0.3034995490660432*x[0] + -237.63931533353403*x[1] + 510.5306054362253*x[2] + 327.7369804093466*x[3] + -814.1317093725387*x[4] + 492.81458798373217*x[5] + 102.8484521916802*x[6] + 184.60648905984*x[7] + 743.519616750542*x[8] + 76.09517221662392*x[9];
}
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom(double *, double *);")
ffibuilder.set_source("_linear_regression_custom", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom' extension
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Ic:python372_x64include -Ic:python372_x64include "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFCinclude" "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023include" "-IC:Program Files (x86)Windows KitsNETFXSDK4.6.1includeum" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0ucrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0shared" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0um" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0winrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0cppwinrt" /Tc_linear_regression_custom.c /Fo.Release_linear_regression_custom.obj
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:c:python372_x64libs /LIBPATH:c:python372_x64PCbuildamd64 "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFClibx64" "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023libx64" "/LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.6.1libumx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0ucrtx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0umx64" /EXPORT:PyInit__linear_regression_custom .Release_linear_regression_custom.obj /OUT:._linear_regression_custom.cp37-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom.cp37-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom.cp37-win_amd64.pyd'
from _linear_regression_custom.lib import linreg_custom
linreg_custom(cptr_x, cptr_out)
out
array([197.61846908])
memo_time.append(timeexe("cffi-linreg-custom", "linreg_custom(cptr_x, cptr_out)"))
Moyenne: 493.86 ns Ecart-type 218.74 ns (with 100 runs) in [359.00 ns, 832.00 ns]

On a gagné un facteur 2.

def predict_clr_custom(x):
    out = numpy.zeros(1)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast("double*", ptr_x)
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast("double*", ptr_out)
    linreg_custom(cptr_x, cptr_out)
    return out

predict_clr_custom(x)
array([197.61846908])
memo_time.append(timeexe("cffi-linreg-custom wrapped", "predict_clr_custom(x)"))
Moyenne: 5.98 µs Ecart-type 1.47 µs (with 100 runs) in [5.00 µs, 8.59 µs]

C’est un peu plus rapide.

et en float?

L’ordinateur fait la distinction entre les double code sur 64 bit et les float codé sur 32 bits. La précision est meilleure dans le premier cas et les calculs sont plus rapides dans le second. Dans le cas du machine learning, on préfère la rapidité à une perte précision en précision qui est souvent compensée par l’optimisation inhérente à tout problème de machine learning. Ce qu’on perd sur une observation, on le retrouve sur une autre.

res = " + ".join("{0}f*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'0.3034995490660432f*x[0] + -237.63931533353403f*x[1] + 510.5306054362253f*x[2] + 327.7369804093466f*x[3] + -814.1317093725387f*x[4] + 492.81458798373217f*x[5] + 102.8484521916802f*x[6] + 184.60648905984f*x[7] + 743.519616750542f*x[8] + 76.09517221662392f*x[9]'
code = """
    static int linreg_custom_float(float * x, float * out)
    {{
        out[0] = {0}f + {1};
    }}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom_float(float * x, float * out)
{
    out[0] = 152.76430691633442f + 0.3034995490660432f*x[0] + -237.63931533353403f*x[1] + 510.5306054362253f*x[2] + 327.7369804093466f*x[3] + -814.1317093725387f*x[4] + 492.81458798373217f*x[5] + 102.8484521916802f*x[6] + 184.60648905984f*x[7] + 743.519616750542f*x[8] + 76.09517221662392f*x[9];
}
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom_float(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom_float.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom_float' extension
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Ic:python372_x64include -Ic:python372_x64include "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFCinclude" "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023include" "-IC:Program Files (x86)Windows KitsNETFXSDK4.6.1includeum" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0ucrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0shared" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0um" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0winrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0cppwinrt" /Tc_linear_regression_custom_float.c /Fo.Release_linear_regression_custom_float.obj
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:c:python372_x64libs /LIBPATH:c:python372_x64PCbuildamd64 "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFClibx64" "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023libx64" "/LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.6.1libumx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0ucrtx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0umx64" /EXPORT:PyInit__linear_regression_custom_float .Release_linear_regression_custom_float.obj /OUT:._linear_regression_custom_float.cp37-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom_float.cp37-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float.cp37-win_amd64.pyd'
from _linear_regression_custom_float.lib import linreg_custom_float
def predict_clr_custom_float(x):
    out = numpy.zeros(1, dtype=numpy.float32)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast ( "float*" , ptr_x )
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast ( "float*" , ptr_out )
    linreg_custom_float(cptr_x, cptr_out)
    return out

Avant d’appeler la fonction, on doit transformer le vecteur iniatial en float32.

x32 = x.astype(numpy.float32)
predict_clr_custom(x32)
array([152.76430692])
memo_time.append(timeexe("cffi-linreg-custom-float wrapped", "predict_clr_custom(x32)"))
Moyenne: 6.37 µs Ecart-type 2.71 µs (with 100 runs) in [5.09 µs, 11.23 µs]

La différence n’est pas flagrante. Mesurons le code C uniquement même si la partie Python ne peut pas être complètement évitée.

out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )

memo_time.append(timeexe("cffi-linreg-custom-float32", "linreg_custom_float(cptr_x, cptr_out)"))
Moyenne: 532.88 ns Ecart-type 521.04 ns (with 100 runs) in [368.00 ns, 928.00 ns]

La différence n’est pas significative.

SIMD

C’est un ensemble d’instructions processeur pour faire des opérations terme à terme sur 4 float32 aussi rapidement qu’une seule. Le processeur ne peut faire des opérations que les nombres sont copiés dans ses registres. Le programme passe alors son temps à copier des nombres depuis la mémoire vers les registres du processeur puis à faire la copie dans le chemin inverse pour le résultat. Les instructions SIMD font gagner du temps du niveau du calcul. Au lieu de faire 4 opérations de multiplication terme à terme, il n’en fait plus qu’une. Il suffit de savoir comment utiliser ces instructions. Avec Visual Studio, elles sont accessible via ces fonctions Memory and Initialization Using Streaming SIMD Extensions. Le code suivant n’est probablement pas optimal mais il n’est pas trop compliqué à suivre.

code = """
#include <xmmintrin.h>

static int linreg_custom_float_simd(float * x, float * out)
{
    __m128 c1 = _mm_set_ps(0.3034995490664121f, -237.63931533353392f, 510.5306054362245f, 327.7369804093466f);
    __m128 c2 = _mm_set_ps(-814.1317093725389f, 492.81458798373245f, 102.84845219168025f, 184.60648905984064f);
    __m128 r1 = _mm_set_ss(152.76430691633442f);
    r1 = _mm_add_ss(r1, _mm_mul_ps(c1, _mm_load_ps(x)));
    r1 = _mm_add_ss(r1, _mm_mul_ps(c2, _mm_load_ps(x+4)));
    float r[4];
    _mm_store_ps(r, r1);
    out[0] = r[0] + r[1] + r[2] + r[3] + 743.5196167505419f * x[8] + 76.095172216624f * x[9];
    return 1;
}
"""
from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom_float_simd(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float_simd", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom_float_simd.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom_float_simd' extension
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Ic:python372_x64include -Ic:python372_x64include "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFCinclude" "-IC:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023include" "-IC:Program Files (x86)Windows KitsNETFXSDK4.6.1includeum" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0ucrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0shared" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0um" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0winrt" "-IC:Program Files (x86)Windows Kits10include10.0.17763.0cppwinrt" /Tc_linear_regression_custom_float_simd.c /Fo.Release_linear_regression_custom_float_simd.obj
C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:c:python372_x64libs /LIBPATH:c:python372_x64PCbuildamd64 "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023ATLMFClibx64" "/LIBPATH:C:Program Files (x86)Microsoft Visual Studio2017CommunityVCToolsMSVC14.16.27023libx64" "/LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.6.1libumx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0ucrtx64" "/LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.17763.0umx64" /EXPORT:PyInit__linear_regression_custom_float_simd .Release_linear_regression_custom_float_simd.obj /OUT:._linear_regression_custom_float_simd.cp37-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom_float_simd.cp37-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float_simd.cp37-win_amd64.pyd'
from _linear_regression_custom_float_simd.lib import linreg_custom_float_simd
out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )

linreg_custom_float_simd(cptr_x, cptr_out)
out
array([171.1178], dtype=float32)
memo_time.append(timeexe("cffi-linreg-custom-float32-simd", "linreg_custom_float_simd(cptr_x, cptr_out)"))
Moyenne: 455.51 ns Ecart-type 243.42 ns (with 100 runs) in [335.00 ns, 787.00 ns]

C’est légèrement mieux, quelques références :

Les processeurs évoluent au fil du temps, 4 float, 8 float, SIMD2, FMA4 Intrinsics Added for Visual Studio 2010 SP1, AVX.

Réécriture purement Python

On continue avec uniquement du Python sans numpy.

coef = clr.coef_
list(coef)
[0.3034995490660432,
 -237.63931533353403,
 510.5306054362253,
 327.7369804093466,
 -814.1317093725387,
 492.81458798373217,
 102.8484521916802,
 184.60648905984,
 743.519616750542,
 76.09517221662392]
code = str(clr.intercept_) + "+" + "+".join("x[{0}]*({1})".format(i, c) for i, c in enumerate(coef))
code
'152.76430691633442+x[0]*(0.3034995490660432)+x[1]*(-237.63931533353403)+x[2]*(510.5306054362253)+x[3]*(327.7369804093466)+x[4]*(-814.1317093725387)+x[5]*(492.81458798373217)+x[6]*(102.8484521916802)+x[7]*(184.60648905984)+x[8]*(743.519616750542)+x[9]*(76.09517221662392)'
def predict_clr_python(x):
    return 152.764306916+x[0]*0.3034995490664121+x[1]*(-237.63931533353392)+x[2]*510.5306054362245+ \
            x[3]*327.7369804093466+ \
            x[4]*(-814.1317093725389)+x[5]*492.81458798373245+x[6]*102.84845219168025+ \
            x[7]*184.60648905984064+x[8]*743.5196167505419+x[9]*76.095172216624

predict_clr_python(x[0])
197.61846907469848
z = list(x[0])
memo_time.append(timeexe("python-linreg-custom", "predict_clr_python(z)"))
Moyenne: 3.40 µs Ecart-type 1.20 µs (with 100 runs) in [2.69 µs, 5.91 µs]

De façon assez surprenante, c’est plutôt rapide. Et si on y mettait une boucle.

def predict_clr_python_loop(x, coef, intercept):
    return intercept + sum(a*b for a, b in zip(x, coef))

predict_clr_python_loop(x[0], list(clr.coef_), clr.intercept_)
197.61846907503298
coef = list(clr.coef_)
intercept = clr.intercept_
memo_time.append(timeexe("python-linreg", "predict_clr_python_loop(z, coef, intercept)"))
Moyenne: 5.43 µs Ecart-type 1.68 µs (with 100 runs) in [4.37 µs, 8.24 µs]

A peine plus long.

Réécriture avec Python et numpy

def predict_clr_numpy(x, coef, intercept):
    return intercept + numpy.dot(coef, x).sum()

predict_clr_numpy(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numpy-linreg-numpy", "predict_clr_numpy(z, coef, clr.intercept_)"))
Moyenne: 9.53 µs Ecart-type 2.24 µs (with 100 runs) in [8.34 µs, 12.44 µs]

Les dimensions des tableaux sont trop petites pour que le calcul matriciel apporte une différence. On se retrouve dans le cas cffi où les échanges Python - C grignotent tout le temps de calcul.

numba

numba essaye de compiler à la volée des bouts de codes écrits en Python. On induque quelle fonction optimiser en faisant précéder la fonction de @jit. Toutes les écritures ne fonctionnent, typiquement, certaines listes en compréhension soulèvent une exception. Il faut donc écrire son code en Python d’une façon assez proche de ce qu’il serait en C.

from numba import jit
@jit
def predict_clr_numba(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba(z, clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-notype", "predict_clr_numba(z, clr.coef_, clr.intercept_)"))
Moyenne: 29.38 µs Ecart-type 9.80 µs (with 100 runs) in [23.60 µs, 49.54 µs]

Plutôt rapide !

@jit('double(double[:], double[:], double)')
def predict_clr_numba_cast(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-type", "predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 1.04 µs Ecart-type 470.76 ns (with 100 runs) in [835.00 ns, 1.89 µs]

On voit que plus on donne d’information au compilateur, plus il est capable d’optimiser.

@jit('float32(float32[:], float32[:], float32)')
def predict_clr_numba_cast_float(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
x32 = x[0].astype(numpy.float32)
c32 = clr.coef_.astype(numpy.float32)
i32 = numpy.float32(clr.intercept_)
predict_clr_numba_cast_float(x32, c32, i32)
197.61846923828125
memo_time.append(timeexe("numba-linreg-type-float32", "predict_clr_numba_cast_float(x32, c32, i32)"))
Moyenne: 793.32 ns Ecart-type 686.11 ns (with 100 runs) in [577.00 ns, 1.54 µs]

On essaye avec les coefficients dans la fonction.

@jit('double(double[:])')
def predict_clr_numba_cast_custom(x):
    coef = [ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
            -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
             7.43519617e+02,  7.60951722e+01]
    s = 152.76430691633442
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba_cast_custom(x[0])
197.61846907190048
memo_time.append(timeexe("numba-linreg-type-custom", "predict_clr_numba_cast_custom(x[0])"))
Moyenne: 1.04 µs Ecart-type 615.32 ns (with 100 runs) in [718.00 ns, 1.94 µs]

On se rapproche des temps obtenus avec cffi sans wrapping, cela signifie que numba fait un bien meilleur travail à ce niveau que le wrapper rapidement créé.

@jit('double(double[:], double[:], double)')
def predict_clr_numba_numpy(x, coef, intercept):
    return intercept + numpy.dot(coef, x).sum()

predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("numba-linreg-type-numpy", "predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 8.54 µs Ecart-type 2.29 µs (with 100 runs) in [7.21 µs, 11.78 µs]

numba est moins performant quand numpy est impliqué car le code de numpy n’est pas réécrit, il est appelé.

cython

cython permet de créer des extensions C de plus grande envergure que numba. C’est l’option choisie par scikit-learn. Il vaut mieux connaître le C pour s’en servir et là encore, l’objectif est de réduire les échanges Python / C qui coûtent cher.

%load_ext cython
%%cython
def predict_clr_cython(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s
predict_clr_cython(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("cython-linreg", "predict_clr_cython(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 4.33 µs Ecart-type 697.40 ns (with 100 runs) in [3.76 µs, 5.54 µs]

Cython fait moins bien que numba dans notre cas et l’optimisation proposée est assez proche du temps déjà obtenue avec le langage Python seul. Cela est dû au fait que la plupart des objets tels que du code associé aux listes ou aux dictionnaires ont été réécrits en C.

%%cython
cimport numpy as npc

def predict_clr_cython_type(npc.ndarray[double, ndim=1, mode='c'] x,
                            npc.ndarray[double, ndim=1, mode='c'] coef,
                            double intercept):
    cdef double s = intercept
    for i in range(0, x.shape[0]):
        s += coef[i] * x[i]
    return s
predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)
197.61846907503298
memo_time.append(timeexe("cython-linreg-type", "predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 1.70 µs Ecart-type 603.22 ns (with 100 runs) in [1.42 µs, 2.93 µs]

Le temps est quasi identique avec un écart type moins grand de façon significative.

Une dernière option : ONNX

ONNX est un format de sérialisation qui permet de décrire un modèle de modèle de machine learning ou de deep learning. Cela permet de dissocer le modèle de la librairie qui a servi à le produire (voir ML.net and ONNX).

try:
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import FloatTensorType
    import onnxruntime
    import onnx
    ok_onnx = True
    print("onnx, skl2onnx, onnxruntime sont disponibles.")

    def save_model(onnx_model, filename):
        with open(filename, "wb") as f:
            f.write(onnx_model.SerializeToString())
except ImportError as e:
    print("La suite requiert onnx, skl2onnx et onnxruntime.")
    print(e)
    ok_onnx = False
onnx, skl2onnx, onnxruntime sont disponibles.

On convertit le modèle au format ONNX.

if ok_onnx:
    onnx_model = convert_sklearn(
        clr, 'model', [('input', FloatTensorType([None, clr.coef_.shape[0]]))])
    save_model(onnx_model, 'model.onnx')

    model_onnx = onnx.load('model.onnx')
    print("Modèle sérialisé au format ONNX")
    print(model_onnx)
else:
    print("onnx, onnxmltools, onnxruntime sont disponibles.")
Modèle sérialisé au format ONNX
ir_version: 6
producer_name: "skl2onnx"
producer_version: "1.5.999996"
domain: "ai.onnx"
model_version: 0
doc_string: ""
graph {
  node {
    input: "input"
    output: "variable"
    name: "LinearRegressor"
    op_type: "LinearRegressor"
    attribute {
      name: "coefficients"
      floats: 0.3034995496273041
      floats: -237.63931274414062
      floats: 510.5306091308594
      floats: 327.7369689941406
      floats: -814.1317138671875
      floats: 492.8145751953125
      floats: 102.84844970703125
      floats: 184.6064910888672
      floats: 743.5195922851562
      floats: 76.09516906738281
      type: FLOATS
    }
    attribute {
      name: "intercepts"
      floats: 152.76431274414062
      type: FLOATS
    }
    domain: "ai.onnx.ml"
  }
  name: "model"
  input {
    name: "input"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 10
          }
        }
      }
    }
  }
  output {
    name: "variable"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
}
opset_import {
  domain: "ai.onnx.ml"
  version: 1
}

On calcule les prédictions. Le module {onnxruntime](https://docs.microsoft.com/en-us/python/api/overview/azure/onnx/intro?view=azure-onnx-py) optimise les calculs pour des modèles de deep learning. Cela explique pourquoi tous les calculs sont réalisés avec des réels représentés sur 4 octets numpy.float32.

if ok_onnx:
    sess = onnxruntime.InferenceSession("model.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt(x):
        return sess.run(["variable"], {'input': x})

    print("Prediction:", predict_onnxrt(x.astype(numpy.float32)))
Input: NodeArg(name='input', type='tensor(float)', shape=[None, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[None, 1])
Prediction: [array([[197.61847]], dtype=float32)]
if ok_onnx:
    x32 = x.astype(numpy.float32)
    memo_time.append(timeexe("onnxruntime-float32", "predict_onnxrt(x32)"))
    memo_time.append(timeexe("onnxruntime-float64", "predict_onnxrt(x.astype(numpy.float32))"))
Moyenne: 13.13 µs Ecart-type 7.59 µs (with 100 runs) in [9.34 µs, 24.25 µs]
Moyenne: 12.75 µs Ecart-type 3.12 µs (with 100 runs) in [10.75 µs, 20.82 µs]

Récapitulatif

import pandas
df = pandas.DataFrame(data=memo_time)
df = df.set_index("legend").sort_values("average")
df
average deviation first first3 last3 repeat min5 max5 code run
legend
cffi-linreg-custom-float32-simd 4.555090e-07 2.434230e-07 6.150000e-07 6.093333e-07 3.370000e-07 1000 3.350000e-07 7.870000e-07 linreg_custom_float_simd(cptr_x, cptr_out) 100
cffi-linreg-custom 4.938620e-07 2.187442e-07 1.265000e-06 6.626667e-07 3.600000e-07 1000 3.590000e-07 8.320000e-07 linreg_custom(cptr_x, cptr_out) 100
cffi-linreg-custom-float32 5.328840e-07 5.210403e-07 7.577000e-06 3.614000e-06 3.693333e-07 1000 3.680000e-07 9.280000e-07 linreg_custom_float(cptr_x, cptr_out) 100
numba-linreg-type-float32 7.933220e-07 6.861051e-07 7.070000e-07 6.353333e-07 5.800000e-07 1000 5.770000e-07 1.539000e-06 predict_clr_numba_cast_float(x32, c32, i32) 100
cffi-linreg 9.852930e-07 1.391556e-06 3.222000e-06 3.122333e-06 5.813333e-07 1000 4.990000e-07 1.645000e-06 lib.linreg(n, cptr_x, cptr_coef, clr.intercept... 100
numba-linreg-type 1.041028e-06 4.707631e-07 1.128000e-06 9.326667e-07 8.393333e-07 1000 8.350000e-07 1.890000e-06 predict_clr_numba_cast(x[0], clr.coef_, clr.in... 100
numba-linreg-type-custom 1.043290e-06 6.153153e-07 8.910000e-07 9.123333e-07 7.183333e-07 1000 7.180000e-07 1.945000e-06 predict_clr_numba_cast_custom(x[0]) 100
cython-linreg-type 1.699581e-06 6.032213e-07 3.837000e-06 4.730333e-06 1.546667e-06 1000 1.415000e-06 2.928000e-06 predict_clr_cython_type(x[0], clr.coef_, clr.i... 100
python-linreg-custom 3.398032e-06 1.199968e-06 3.790000e-06 4.507667e-06 2.915667e-06 1000 2.690000e-06 5.906000e-06 predict_clr_python(z) 100
cython-linreg 4.325581e-06 6.973983e-07 4.418000e-06 4.213333e-06 4.037333e-06 1000 3.764000e-06 5.543000e-06 predict_clr_cython(x[0], clr.coef_, clr.interc... 100
python-linreg 5.429431e-06 1.680433e-06 9.941000e-06 1.139767e-05 4.493333e-06 1000 4.367000e-06 8.237000e-06 predict_clr_python_loop(z, coef, intercept) 100
cffi-linreg-custom wrapped 5.977490e-06 1.474244e-06 1.025300e-05 8.704667e-06 5.141667e-06 1000 5.003000e-06 8.593000e-06 predict_clr_custom(x) 100
cffi-linreg-custom-float wrapped 6.365030e-06 2.707499e-06 1.443800e-05 1.581333e-05 5.183000e-06 1000 5.094000e-06 1.123300e-05 predict_clr_custom(x32) 100
cffi-linreg-wrapped 8.499981e-06 2.593850e-06 1.930100e-05 2.301433e-05 9.639667e-06 1000 6.925000e-06 1.360000e-05 predict_clr(x, clr) 100
numba-linreg-type-numpy 8.542519e-06 2.292600e-06 1.114100e-05 1.070200e-05 8.175333e-06 1000 7.213000e-06 1.178300e-05 predict_clr_numba_numpy(x[0], clr.coef_, clr.i... 100
numpy-linreg-numpy 9.529993e-06 2.238715e-06 2.467000e-05 2.012967e-05 9.383667e-06 1000 8.343000e-06 1.244300e-05 predict_clr_numpy(z, coef, clr.intercept_) 100
onnxruntime-float64 1.274827e-05 3.123604e-06 2.859500e-05 2.096967e-05 1.121667e-05 1000 1.075400e-05 2.081900e-05 predict_onnxrt(x.astype(numpy.float32)) 100
onnxruntime-float32 1.313077e-05 7.585132e-06 2.746100e-05 2.406433e-05 1.211433e-05 1000 9.342000e-06 2.424900e-05 predict_onnxrt(x32) 100
numba-linreg-notype 2.937788e-05 9.797700e-06 2.517300e-05 5.948667e-05 3.185400e-05 1000 2.360400e-05 4.954300e-05 predict_clr_numba(z, clr.coef_, clr.intercept_) 100
sklearn.predict 4.888248e-05 1.273852e-05 1.063140e-04 1.297223e-04 4.166000e-05 1000 4.140200e-05 7.901700e-05 clr.predict(z) 100

On enlève quelques colonnes et on rappelle :

  • cffi: signifie optimisé avec cffi

  • custom: pas de boucle mais la fonction ne peut prédire qu’une seule régression linéaire

  • float32: utilise des float et non des double

  • linreg: régression linéaire

  • numba: optimisation avec numba

  • numpy: optimisation avec numpy

  • python: pas de C, que du python

  • simd: optimisé avec les instructions SIMD

  • sklearn: fonction sklearn.predict

  • static: la fonction utilise des variables statiques

  • type: la fonction est typée et ne fonctionne qu’avec un type précis en entrée.

  • wrapped: code optimisé mais embabllé dans une fonction Python qui elle ne l’est pas (les containers sont recréés à chaque fois)

cols = ["average", "deviation", "min5", "max5", "run", "code"]
df[cols]
average deviation min5 max5 run code
legend
cffi-linreg-custom-float32-simd 4.555090e-07 2.434230e-07 3.350000e-07 7.870000e-07 100 linreg_custom_float_simd(cptr_x, cptr_out)
cffi-linreg-custom 4.938620e-07 2.187442e-07 3.590000e-07 8.320000e-07 100 linreg_custom(cptr_x, cptr_out)
cffi-linreg-custom-float32 5.328840e-07 5.210403e-07 3.680000e-07 9.280000e-07 100 linreg_custom_float(cptr_x, cptr_out)
numba-linreg-type-float32 7.933220e-07 6.861051e-07 5.770000e-07 1.539000e-06 100 predict_clr_numba_cast_float(x32, c32, i32)
cffi-linreg 9.852930e-07 1.391556e-06 4.990000e-07 1.645000e-06 100 lib.linreg(n, cptr_x, cptr_coef, clr.intercept...
numba-linreg-type 1.041028e-06 4.707631e-07 8.350000e-07 1.890000e-06 100 predict_clr_numba_cast(x[0], clr.coef_, clr.in...
numba-linreg-type-custom 1.043290e-06 6.153153e-07 7.180000e-07 1.945000e-06 100 predict_clr_numba_cast_custom(x[0])
cython-linreg-type 1.699581e-06 6.032213e-07 1.415000e-06 2.928000e-06 100 predict_clr_cython_type(x[0], clr.coef_, clr.i...
python-linreg-custom 3.398032e-06 1.199968e-06 2.690000e-06 5.906000e-06 100 predict_clr_python(z)
cython-linreg 4.325581e-06 6.973983e-07 3.764000e-06 5.543000e-06 100 predict_clr_cython(x[0], clr.coef_, clr.interc...
python-linreg 5.429431e-06 1.680433e-06 4.367000e-06 8.237000e-06 100 predict_clr_python_loop(z, coef, intercept)
cffi-linreg-custom wrapped 5.977490e-06 1.474244e-06 5.003000e-06 8.593000e-06 100 predict_clr_custom(x)
cffi-linreg-custom-float wrapped 6.365030e-06 2.707499e-06 5.094000e-06 1.123300e-05 100 predict_clr_custom(x32)
cffi-linreg-wrapped 8.499981e-06 2.593850e-06 6.925000e-06 1.360000e-05 100 predict_clr(x, clr)
numba-linreg-type-numpy 8.542519e-06 2.292600e-06 7.213000e-06 1.178300e-05 100 predict_clr_numba_numpy(x[0], clr.coef_, clr.i...
numpy-linreg-numpy 9.529993e-06 2.238715e-06 8.343000e-06 1.244300e-05 100 predict_clr_numpy(z, coef, clr.intercept_)
onnxruntime-float64 1.274827e-05 3.123604e-06 1.075400e-05 2.081900e-05 100 predict_onnxrt(x.astype(numpy.float32))
onnxruntime-float32 1.313077e-05 7.585132e-06 9.342000e-06 2.424900e-05 100 predict_onnxrt(x32)
numba-linreg-notype 2.937788e-05 9.797700e-06 2.360400e-05 4.954300e-05 100 predict_clr_numba(z, clr.coef_, clr.intercept_)
sklearn.predict 4.888248e-05 1.273852e-05 4.140200e-05 7.901700e-05 100 clr.predict(z)
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(14,6))
df[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                  legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
../_images/cffi_linear_regression_109_0.png

Il manque à ce comparatif le GPU mais c’est un peu plus complexe à mettre en oeuvre, il faut une carte GPU et la parallélisation n’apporterait pas énormément compte tenu de la faible dimension du problème.

Prédiction one-off et biais de mesure

Le graphique précédent montre que la fonction predict de scikit-learn est la plus lente. La première raison est que ce code est valable pour toutes les régresssions linéaires alors que toutes les autres fonctions sont spécialisées pour un seul modèle. La seconde raison est que le code de scikit-learn est optimisé pour le calcul de plusieurs prédictions à la fois alors que toutes les autres fonctions n’en calcule qu’une seule (scénario dit one-off). On compare à ce que donnerait unev version purement python et numpy.

def predict_clr_python_loop_multi(x, coef, intercept):
    # On s'attend à deux dimension.
    res = numpy.zeros((x.shape[0], 1))
    res[:, 0] = intercept
    for i in range(0, x.shape[0]):
        res[i, 0] += sum(a*b for a, b in zip(x[i, :], coef))
    return res

predict_clr_python_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[197.61846908],
       [155.43979328]])
def predict_clr_numpy_loop_multi(x, coef, intercept):
    # On s'attend à deux dimension.
    res = numpy.ones((x.shape[0], 1)) * intercept
    res += x @ coef.reshape((len(coef), 1))
    return res

predict_clr_numpy_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[197.61846908],
       [155.43979328]])
def predict_clr_numba_cast_multi(X, coef, intercept):
    return [predict_clr_numba_cast(x, coef, intercept) for x in X]

predict_clr_numba_cast_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[197.61846907503298, 155.43979327521237]
def predict_clr_cython_type_multi(X, coef, intercept):
    return [predict_clr_cython_type(x, coef, intercept) for x in X]

predict_clr_cython_type_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[197.61846907503298, 155.43979327521237]
memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
for i in batch:
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]

    print("batch", "=", i)
    repeat=20 if i >= 5000 else 100

    memo.append(timeexe("sklearn.predict %d" % i, "clr.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"

    if i <= 1000:
        # très lent
        memo.append(timeexe("python %d" % i, "predict_clr_python_loop_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=20, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "python"

    memo.append(timeexe("numpy %d" % i, "predict_clr_numpy_loop_multi(mx, clr.coef_, clr.intercept_)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "numpy"

    if i <= 10000:
        # très lent
        memo.append(timeexe("numba %d" % i, "predict_clr_numba_cast_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "numba"

    if i <= 1000:
        # très lent
        memo.append(timeexe("cython %d" % i, "predict_clr_cython_type_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "cython"

    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"
batch = 1
Moyenne: 50.35 µs Ecart-type 23.49 µs (with 10 runs) in [40.27 µs, 79.36 µs]
Moyenne: 9.94 µs Ecart-type 1.25 µs (with 10 runs) in [9.40 µs, 14.17 µs]
Moyenne: 9.97 µs Ecart-type 3.45 µs (with 10 runs) in [8.65 µs, 13.33 µs]
Moyenne: 2.37 µs Ecart-type 334.38 ns (with 10 runs) in [2.28 µs, 3.68 µs]
Moyenne: 2.99 µs Ecart-type 608.86 ns (with 10 runs) in [2.72 µs, 4.30 µs]
Moyenne: 12.64 µs Ecart-type 4.66 µs (with 10 runs) in [10.74 µs, 22.58 µs]
batch = 10
Moyenne: 47.90 µs Ecart-type 10.74 µs (with 10 runs) in [40.60 µs, 69.84 µs]
Moyenne: 91.47 µs Ecart-type 21.35 µs (with 10 runs) in [76.69 µs, 162.05 µs]
Moyenne: 9.04 µs Ecart-type 2.60 µs (with 10 runs) in [8.34 µs, 12.59 µs]
Moyenne: 10.53 µs Ecart-type 3.26 µs (with 10 runs) in [8.89 µs, 15.77 µs]
Moyenne: 15.71 µs Ecart-type 3.38 µs (with 10 runs) in [14.42 µs, 20.20 µs]
Moyenne: 13.88 µs Ecart-type 4.31 µs (with 10 runs) in [11.91 µs, 21.98 µs]
batch = 100
Moyenne: 57.76 µs Ecart-type 16.27 µs (with 10 runs) in [47.91 µs, 77.18 µs]
Moyenne: 885.09 µs Ecart-type 131.30 µs (with 10 runs) in [775.95 µs, 1.35 ms]
Moyenne: 10.77 µs Ecart-type 7.87 µs (with 10 runs) in [8.48 µs, 25.84 µs]
Moyenne: 90.62 µs Ecart-type 33.39 µs (with 10 runs) in [73.73 µs, 173.64 µs]
Moyenne: 144.72 µs Ecart-type 15.74 µs (with 10 runs) in [132.52 µs, 178.49 µs]
Moyenne: 28.30 µs Ecart-type 11.68 µs (with 10 runs) in [22.42 µs, 56.08 µs]
batch = 200
Moyenne: 60.08 µs Ecart-type 15.13 µs (with 10 runs) in [50.28 µs, 90.78 µs]
Moyenne: 1.77 ms Ecart-type 195.29 µs (with 10 runs) in [1.60 ms, 2.24 ms]
Moyenne: 11.61 µs Ecart-type 6.22 µs (with 10 runs) in [9.39 µs, 19.78 µs]
Moyenne: 187.73 µs Ecart-type 38.67 µs (with 10 runs) in [150.68 µs, 257.26 µs]
Moyenne: 301.04 µs Ecart-type 33.93 µs (with 10 runs) in [269.85 µs, 365.77 µs]
Moyenne: 37.89 µs Ecart-type 9.72 µs (with 10 runs) in [33.59 µs, 52.36 µs]
batch = 500
Moyenne: 51.65 µs Ecart-type 10.25 µs (with 10 runs) in [46.24 µs, 67.69 µs]
Moyenne: 4.50 ms Ecart-type 530.44 µs (with 10 runs) in [4.02 ms, 6.08 ms]
Moyenne: 12.14 µs Ecart-type 5.46 µs (with 10 runs) in [11.00 µs, 14.76 µs]
Moyenne: 463.39 µs Ecart-type 111.25 µs (with 10 runs) in [377.28 µs, 719.88 µs]
Moyenne: 719.70 µs Ecart-type 59.80 µs (with 10 runs) in [670.67 µs, 842.77 µs]
Moyenne: 80.06 µs Ecart-type 21.94 µs (with 10 runs) in [66.55 µs, 144.40 µs]
batch = 1000
Moyenne: 71.72 µs Ecart-type 24.80 µs (with 10 runs) in [54.10 µs, 116.16 µs]
Moyenne: 8.54 ms Ecart-type 553.03 µs (with 10 runs) in [8.12 ms, 10.20 ms]
Moyenne: 16.38 µs Ecart-type 6.89 µs (with 10 runs) in [15.04 µs, 18.62 µs]
Moyenne: 891.24 µs Ecart-type 114.73 µs (with 10 runs) in [782.39 µs, 1.09 ms]
Moyenne: 1.57 ms Ecart-type 277.73 µs (with 10 runs) in [1.37 ms, 2.40 ms]
Moyenne: 138.39 µs Ecart-type 37.89 µs (with 10 runs) in [121.21 µs, 260.11 µs]
batch = 2000
Moyenne: 75.68 µs Ecart-type 30.55 µs (with 10 runs) in [61.40 µs, 172.43 µs]
Moyenne: 21.46 µs Ecart-type 7.02 µs (with 10 runs) in [17.78 µs, 35.19 µs]
Moyenne: 2.22 ms Ecart-type 507.56 µs (with 10 runs) in [1.69 ms, 3.17 ms]
Moyenne: 299.72 µs Ecart-type 85.13 µs (with 10 runs) in [233.49 µs, 506.91 µs]
batch = 3000
Moyenne: 82.98 µs Ecart-type 20.79 µs (with 10 runs) in [68.78 µs, 139.49 µs]
Moyenne: 25.64 µs Ecart-type 8.20 µs (with 10 runs) in [21.03 µs, 49.53 µs]
Moyenne: 2.67 ms Ecart-type 409.03 µs (with 10 runs) in [2.32 ms, 3.65 ms]
Moyenne: 404.86 µs Ecart-type 53.73 µs (with 10 runs) in [353.84 µs, 530.68 µs]
batch = 4000
Moyenne: 96.00 µs Ecart-type 19.23 µs (with 10 runs) in [76.39 µs, 138.20 µs]
Moyenne: 29.33 µs Ecart-type 7.03 µs (with 10 runs) in [23.92 µs, 41.40 µs]
Moyenne: 4.02 ms Ecart-type 932.71 µs (with 10 runs) in [3.12 ms, 5.86 ms]
Moyenne: 513.18 µs Ecart-type 58.34 µs (with 10 runs) in [465.11 µs, 628.91 µs]
batch = 5000
Moyenne: 98.52 µs Ecart-type 19.78 µs (with 10 runs) in [80.50 µs, 138.15 µs]
Moyenne: 41.03 µs Ecart-type 6.99 µs (with 10 runs) in [38.43 µs, 70.88 µs]
Moyenne: 4.64 ms Ecart-type 737.53 µs (with 10 runs) in [3.99 ms, 7.57 ms]
Moyenne: 692.20 µs Ecart-type 93.68 µs (with 10 runs) in [600.67 µs, 956.01 µs]
batch = 10000
Moyenne: 142.41 µs Ecart-type 39.43 µs (with 10 runs) in [110.04 µs, 236.68 µs]
Moyenne: 48.44 µs Ecart-type 14.84 µs (with 10 runs) in [38.57 µs, 99.60 µs]
Moyenne: 8.80 ms Ecart-type 711.78 µs (with 10 runs) in [7.95 ms, 10.70 ms]
Moyenne: 1.37 ms Ecart-type 232.85 µs (with 10 runs) in [1.16 ms, 1.90 ms]
batch = 20000
Moyenne: 214.34 µs Ecart-type 35.58 µs (with 10 runs) in [184.11 µs, 298.01 µs]
Moyenne: 91.49 µs Ecart-type 13.14 µs (with 10 runs) in [64.97 µs, 107.83 µs]
Moyenne: 2.74 ms Ecart-type 426.36 µs (with 10 runs) in [2.41 ms, 4.06 ms]
batch = 50000
Moyenne: 848.59 µs Ecart-type 126.64 µs (with 10 runs) in [736.89 µs, 1.10 ms]
Moyenne: 449.14 µs Ecart-type 213.65 µs (with 10 runs) in [218.42 µs, 1.05 ms]
Moyenne: 8.82 ms Ecart-type 974.88 µs (with 10 runs) in [8.16 ms, 11.62 ms]
batch = 75000
Moyenne: 1.53 ms Ecart-type 257.07 µs (with 10 runs) in [1.23 ms, 2.03 ms]
Moyenne: 402.40 µs Ecart-type 58.03 µs (with 10 runs) in [344.18 µs, 548.16 µs]
Moyenne: 12.99 ms Ecart-type 897.92 µs (with 10 runs) in [12.37 ms, 16.15 ms]
batch = 100000
Moyenne: 2.09 ms Ecart-type 361.05 µs (with 10 runs) in [1.65 ms, 2.80 ms]
Moyenne: 578.85 µs Ecart-type 128.11 µs (with 10 runs) in [475.55 µs, 888.45 µs]
Moyenne: 17.41 ms Ecart-type 925.97 µs (with 10 runs) in [16.55 ms, 20.79 ms]
batch = 150000
Moyenne: 3.97 ms Ecart-type 362.63 µs (with 10 runs) in [3.66 ms, 5.27 ms]
Moyenne: 2.96 ms Ecart-type 367.98 µs (with 10 runs) in [2.67 ms, 3.98 ms]
Moyenne: 26.08 ms Ecart-type 1.16 ms (with 10 runs) in [24.98 ms, 28.89 ms]
batch = 200000
Moyenne: 6.32 ms Ecart-type 1.00 ms (with 10 runs) in [5.17 ms, 8.88 ms]
Moyenne: 4.46 ms Ecart-type 393.71 µs (with 10 runs) in [3.98 ms, 5.52 ms]
Moyenne: 37.52 ms Ecart-type 2.93 ms (with 10 runs) in [33.46 ms, 44.12 ms]
batch = 300000
Moyenne: 8.16 ms Ecart-type 665.57 µs (with 10 runs) in [7.38 ms, 10.13 ms]
Moyenne: 6.27 ms Ecart-type 496.69 µs (with 10 runs) in [5.68 ms, 7.76 ms]
Moyenne: 58.99 ms Ecart-type 10.29 ms (with 10 runs) in [52.32 ms, 94.43 ms]
batch = 400000
Moyenne: 11.32 ms Ecart-type 1.30 ms (with 10 runs) in [9.81 ms, 15.90 ms]
Moyenne: 8.24 ms Ecart-type 1.46 ms (with 10 runs) in [7.16 ms, 12.86 ms]
Moyenne: 70.63 ms Ecart-type 4.47 ms (with 10 runs) in [66.20 ms, 84.17 ms]
batch = 500000
Moyenne: 12.85 ms Ecart-type 676.87 µs (with 10 runs) in [12.20 ms, 14.35 ms]
Moyenne: 9.43 ms Ecart-type 459.49 µs (with 10 runs) in [8.97 ms, 10.61 ms]
Moyenne: 85.97 ms Ecart-type 2.29 ms (with 10 runs) in [82.79 ms, 90.99 ms]
batch = 600000
Moyenne: 17.03 ms Ecart-type 1.72 ms (with 10 runs) in [15.52 ms, 22.10 ms]
Moyenne: 11.96 ms Ecart-type 732.75 µs (with 10 runs) in [11.01 ms, 13.59 ms]
Moyenne: 104.53 ms Ecart-type 4.12 ms (with 10 runs) in [100.67 ms, 116.83 ms]
dfb = pandas.DataFrame(memo)[["average", "lib", "batch"]]
piv = dfb.pivot("batch", "lib", "average")
piv
lib cython numba numpy onnxruntime python sklearn
batch
1 0.000003 0.000002 0.000010 0.000013 0.000010 0.000050
10 0.000016 0.000011 0.000009 0.000014 0.000091 0.000048
100 0.000145 0.000091 0.000011 0.000028 0.000885 0.000058
200 0.000301 0.000188 0.000012 0.000038 0.001767 0.000060
500 0.000720 0.000463 0.000012 0.000080 0.004504 0.000052
1000 0.001569 0.000891 0.000016 0.000138 0.008544 0.000072
2000 NaN 0.002224 0.000021 0.000300 NaN 0.000076
3000 NaN 0.002674 0.000026 0.000405 NaN 0.000083
4000 NaN 0.004018 0.000029 0.000513 NaN 0.000096
5000 NaN 0.004644 0.000041 0.000692 NaN 0.000099
10000 NaN 0.008801 0.000048 0.001369 NaN 0.000142
20000 NaN NaN 0.000091 0.002738 NaN 0.000214
50000 NaN NaN 0.000449 0.008820 NaN 0.000849
75000 NaN NaN 0.000402 0.012992 NaN 0.001533
100000 NaN NaN 0.000579 0.017411 NaN 0.002090
150000 NaN NaN 0.002956 0.026082 NaN 0.003969
200000 NaN NaN 0.004460 0.037523 NaN 0.006321
300000 NaN NaN 0.006267 0.058988 NaN 0.008155
400000 NaN NaN 0.008238 0.070634 NaN 0.011321
500000 NaN NaN 0.009431 0.085967 NaN 0.012851
600000 NaN NaN 0.011956 0.104532 NaN 0.017034
for c in piv.columns:
    piv["ave_" + c] = piv[c] / piv.index
piv
lib cython numba numpy onnxruntime python sklearn ave_cython ave_numba ave_numpy ave_onnxruntime ave_python ave_sklearn
batch
1 0.000003 0.000002 0.000010 0.000013 0.000010 0.000050 0.000003 2.374700e-06 9.969800e-06 1.263570e-05 0.000010 5.035390e-05
10 0.000016 0.000011 0.000009 0.000014 0.000091 0.000048 0.000002 1.052880e-06 9.039700e-07 1.387610e-06 0.000009 4.789650e-06
100 0.000145 0.000091 0.000011 0.000028 0.000885 0.000058 0.000001 9.061520e-07 1.077050e-07 2.830330e-07 0.000009 5.776300e-07
200 0.000301 0.000188 0.000012 0.000038 0.001767 0.000060 0.000002 9.386710e-07 5.805900e-08 1.894435e-07 0.000009 3.003940e-07
500 0.000720 0.000463 0.000012 0.000080 0.004504 0.000052 0.000001 9.267770e-07 2.428640e-08 1.601118e-07 0.000009 1.033056e-07
1000 0.001569 0.000891 0.000016 0.000138 0.008544 0.000072 0.000002 8.912436e-07 1.637500e-08 1.383949e-07 0.000009 7.172380e-08
2000 NaN 0.002224 0.000021 0.000300 NaN 0.000076 NaN 1.112160e-06 1.073035e-08 1.498589e-07 NaN 3.783965e-08
3000 NaN 0.002674 0.000026 0.000405 NaN 0.000083 NaN 8.911731e-07 8.547533e-09 1.349518e-07 NaN 2.766113e-08
4000 NaN 0.004018 0.000029 0.000513 NaN 0.000096 NaN 1.004487e-06 7.331600e-09 1.282948e-07 NaN 2.399990e-08
5000 NaN 0.004644 0.000041 0.000692 NaN 0.000099 NaN 9.288056e-07 8.205800e-09 1.384403e-07 NaN 1.970420e-08
10000 NaN 0.008801 0.000048 0.001369 NaN 0.000142 NaN 8.800830e-07 4.844200e-09 1.368824e-07 NaN 1.424145e-08
20000 NaN NaN 0.000091 0.002738 NaN 0.000214 NaN NaN 4.574600e-09 1.369245e-07 NaN 1.071715e-08
50000 NaN NaN 0.000449 0.008820 NaN 0.000849 NaN NaN 8.982860e-09 1.764029e-07 NaN 1.697171e-08
75000 NaN NaN 0.000402 0.012992 NaN 0.001533 NaN NaN 5.365300e-09 1.732245e-07 NaN 2.044074e-08
100000 NaN NaN 0.000579 0.017411 NaN 0.002090 NaN NaN 5.788455e-09 1.741052e-07 NaN 2.089537e-08
150000 NaN NaN 0.002956 0.026082 NaN 0.003969 NaN NaN 1.970775e-08 1.738810e-07 NaN 2.645820e-08
200000 NaN NaN 0.004460 0.037523 NaN 0.006321 NaN NaN 2.229910e-08 1.876159e-07 NaN 3.160423e-08
300000 NaN NaN 0.006267 0.058988 NaN 0.008155 NaN NaN 2.089083e-08 1.966262e-07 NaN 2.718413e-08
400000 NaN NaN 0.008238 0.070634 NaN 0.011321 NaN NaN 2.059604e-08 1.765860e-07 NaN 2.830290e-08
500000 NaN NaN 0.009431 0.085967 NaN 0.012851 NaN NaN 1.886231e-08 1.719346e-07 NaN 2.570212e-08
600000 NaN NaN 0.011956 0.104532 NaN 0.017034 NaN NaN 1.992636e-08 1.742197e-07 NaN 2.838989e-08
libs = list(c for c in piv.columns if "ave_" in c)
ax = piv.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch")
ax.grid(True);
../_images/cffi_linear_regression_119_0.png

Le minimum obtenu est pour 10^{-8} s soit 10 ns. Cela montre que la comparaisson précédente était incomplète voire biaisée. Tout dépend de l’usage qu’on fait de la fonction de prédiction même s’il sera toujours possible de d’écrire un code spécialisé plus rapide que toute autre fonction générique. En général, plus on reste du côté Python, plus le programme est lent. Le nombre de passage de l’un à l’autre, selon la façon dont il est fait ralenti aussi. En tenant compte de cela, le programme rouge sera plus lent que le vert.

from pyquickhelper.helpgen import NbImage
NbImage("pycpp.png")
../_images/cffi_linear_regression_121_0.png

Ces résultats sont d’une façon générale assez volatile car le temps de calcul est enrobé dans plusieurs fonctions Python qui rendent une mesure précise difficile. Il reste néanmoins une bonne idée des ordres de grandeurs.

Random Forest

On reproduit les mêmes résultats pour une random forest mais la réécriture n’est plus aussi simple qu’une régression linéaire.

Une prédiction à la fois

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test  = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test  = diabetes.target[-20:]
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10)
rf.fit(diabetes_X_train, diabetes_y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
memo_time = []
x = diabetes_X_test[:1]
memo_time.append(timeexe("sklearn-rf", "rf.predict(x)", repeat=100, number=20))
Moyenne: 691.37 µs Ecart-type 89.73 µs (with 20 runs) in [638.31 µs, 946.03 µs]

C’est beaucoup plus long que la régression linéaire. On essaye avec onnx.

if ok_onnx:
    onnxrf_model = convert_sklearn(
        rf, 'model', [('input', FloatTensorType([None, clr.coef_.shape[0]]))])
    save_model(onnxrf_model, 'model_rf.onnx')
    model_onnx = onnx.load('model_rf.onnx')
if ok_onnx:
    sess = onnxruntime.InferenceSession("model_rf.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt_rf(x):
        return sess.run(["variable"], {'input': x})

    print(predict_onnxrt_rf(x.astype(numpy.float32)))
    memo_time.append(timeexe("onnx-rf", "predict_onnxrt_rf(x.astype(numpy.float32))", repeat=100, number=20))
Input: NodeArg(name='input', type='tensor(float)', shape=[None, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[None, 1])
[array([[193.4]], dtype=float32)]
Moyenne: 15.61 µs Ecart-type 10.12 µs (with 20 runs) in [12.68 µs, 25.82 µs]

C’est beaucoup plus rapide.

import pandas
df2 = pandas.DataFrame(data=memo_time)
df2 = df2.set_index("legend").sort_values("average")
df2
average deviation first first3 last3 repeat min5 max5 code run
legend
onnx-rf 0.000016 0.00001 0.000109 0.000053 0.000018 100 0.000013 0.000026 predict_onnxrt_rf(x.astype(numpy.float32)) 20
sklearn-rf 0.000691 0.00009 0.001059 0.000838 0.000650 100 0.000638 0.000946 rf.predict(x) 20
fig, ax = plt.subplots(1, 1, figsize=(14,4))
df2[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                   legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
../_images/cffi_linear_regression_133_0.png

Prédiction en batch

memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
repeat = 10
for i in batch[:15]:
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]

    print("batch", "=", i)

    memo.append(timeexe("sklearn.predict %d" % i, "rf.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"

    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt_rf(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"
batch = 1
Moyenne: 837.72 µs Ecart-type 230.66 µs (with 10 runs) in [656.11 µs, 1.46 ms]
Moyenne: 18.29 µs Ecart-type 11.65 µs (with 10 runs) in [12.57 µs, 52.32 µs]
batch = 10
Moyenne: 695.72 µs Ecart-type 70.97 µs (with 10 runs) in [640.66 µs, 897.78 µs]
Moyenne: 34.93 µs Ecart-type 11.24 µs (with 10 runs) in [29.73 µs, 68.25 µs]
batch = 100
Moyenne: 854.57 µs Ecart-type 209.18 µs (with 10 runs) in [715.64 µs, 1.45 ms]
Moyenne: 217.84 µs Ecart-type 16.69 µs (with 10 runs) in [205.13 µs, 261.42 µs]
batch = 200
Moyenne: 797.57 µs Ecart-type 31.20 µs (with 10 runs) in [758.16 µs, 843.70 µs]
Moyenne: 411.66 µs Ecart-type 18.53 µs (with 10 runs) in [388.02 µs, 450.80 µs]
batch = 500
Moyenne: 1.06 ms Ecart-type 199.81 µs (with 10 runs) in [887.37 µs, 1.48 ms]
Moyenne: 991.69 µs Ecart-type 24.41 µs (with 10 runs) in [956.97 µs, 1.03 ms]
batch = 1000
Moyenne: 1.11 ms Ecart-type 54.61 µs (with 10 runs) in [1.06 ms, 1.22 ms]
Moyenne: 2.00 ms Ecart-type 141.03 µs (with 10 runs) in [1.93 ms, 2.42 ms]
batch = 2000
Moyenne: 1.51 ms Ecart-type 144.95 µs (with 10 runs) in [1.41 ms, 1.93 ms]
Moyenne: 4.01 ms Ecart-type 224.46 µs (with 10 runs) in [3.81 ms, 4.60 ms]
batch = 3000
Moyenne: 1.80 ms Ecart-type 41.07 µs (with 10 runs) in [1.77 ms, 1.89 ms]
Moyenne: 6.15 ms Ecart-type 1.06 ms (with 10 runs) in [5.66 ms, 9.30 ms]
batch = 4000
Moyenne: 2.37 ms Ecart-type 366.07 µs (with 10 runs) in [2.12 ms, 3.39 ms]
Moyenne: 7.97 ms Ecart-type 679.47 µs (with 10 runs) in [7.48 ms, 9.91 ms]
batch = 5000
Moyenne: 2.57 ms Ecart-type 171.02 µs (with 10 runs) in [2.43 ms, 3.03 ms]
Moyenne: 9.60 ms Ecart-type 253.97 µs (with 10 runs) in [9.23 ms, 10.02 ms]
batch = 10000
Moyenne: 4.28 ms Ecart-type 103.22 µs (with 10 runs) in [4.15 ms, 4.51 ms]
Moyenne: 19.20 ms Ecart-type 289.60 µs (with 10 runs) in [18.83 ms, 19.91 ms]
batch = 20000
Moyenne: 7.84 ms Ecart-type 198.24 µs (with 10 runs) in [7.59 ms, 8.22 ms]
Moyenne: 38.51 ms Ecart-type 394.95 µs (with 10 runs) in [38.01 ms, 39.10 ms]
batch = 50000
Moyenne: 19.40 ms Ecart-type 256.43 µs (with 10 runs) in [19.09 ms, 19.84 ms]
Moyenne: 97.66 ms Ecart-type 740.01 µs (with 10 runs) in [96.60 ms, 99.54 ms]
batch = 75000
Moyenne: 28.80 ms Ecart-type 332.27 µs (with 10 runs) in [28.29 ms, 29.40 ms]
Moyenne: 147.44 ms Ecart-type 3.00 ms (with 10 runs) in [145.41 ms, 155.52 ms]
batch = 100000
Moyenne: 37.85 ms Ecart-type 416.17 µs (with 10 runs) in [37.13 ms, 38.36 ms]
Moyenne: 213.95 ms Ecart-type 16.26 ms (with 10 runs) in [195.66 ms, 247.30 ms]
dfbrf = pandas.DataFrame(memo)[["average", "lib", "batch"]]
pivrf = dfbrf.pivot("batch", "lib", "average")
for c in pivrf.columns:
    pivrf["ave_" + c] = pivrf[c] / pivrf.index
libs = list(c for c in pivrf.columns if "ave_" in c)
ax = pivrf.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch\nrandom forest")
ax.grid(True);
../_images/cffi_linear_regression_136_0.png