Optimisation de code avec cffi, numba, cython#

Links: notebook, html, python, slides, GitHub

L’idée est de recoder une fonction en C. On prend comme exemple la fonction de prédiction de la régression linéaire de scikit-learn et de prévoir le gain de temps qu’on obtient en recodant la fonction dans un langage plus rapide.

from jyquickhelper import add_notebook_menu
add_notebook_menu()

memo_time = []
import timeit

def unit(x):
    if x >= 1: return "%1.2f s" % x
    elif x >= 1e-3: return "%1.2f ms" % (x* 1000)
    elif x >= 1e-6: return "%1.2f µs" % (x* 1000**2)
    elif x >= 1e-9: return "%1.2f ns" % (x* 1000**3)
    else:
        return "%1.2g s" % x

def timeexe(legend, code, number=100, repeat=1000):
    rep = timeit.repeat(code, number=number, repeat=repeat, globals=globals())
    ave = sum(rep) / (number * repeat)
    std = (sum((x/number - ave)**2 for x in rep) / repeat)**0.5
    fir = rep[0]/number
    fir3 = sum(rep[:3]) / (3 * number)
    las3 = sum(rep[-3:]) / (3 * number)
    rep.sort()
    mini = rep[len(rep)//20] / number
    maxi = rep[-len(rep)//20] / number
    print("Moyenne: %s Ecart-type %s (with %d runs) in [%s, %s]" % (
                unit(ave), unit(std), number, unit(mini), unit(maxi)))
    return dict(legend=legend, average=ave, deviation=std, first=fir, first3=fir3,
                last3=las3, repeat=repeat, min5=mini, max5=maxi, code=code, run=number)

Régression linéaire #

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

diabetes = load_diabetes()
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(diabetes.data, diabetes.target)

from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(diabetes_X_train, diabetes_y_train)

LinearRegression()

clr.coef_

array([ -35.81159278, -267.39308261,  503.56121841,  337.87944184,
       -577.27255236,  373.62939477,  -99.69779327,   78.39842094,
        656.54309153,   80.3383998 ])

clr.intercept_

152.69613239933642

z = diabetes_X_test[0:1,:]
memo_time.append(timeexe("sklearn.predict", "clr.predict(z)"))

Moyenne: 45.50 µs Ecart-type 6.34 µs (with 100 runs) in [40.87 µs, 52.95 µs]

%timeit clr.predict(z)

45.2 µs ± 744 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

optimisation avec cffi #

On s’inspire de l’exemple Purely for performance (API level, out-of-line).

from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg(int, double *, double *, double, double *);")

ffibuilder.set_source("_linear_regression",
r"""
    static int linreg(int dimension, double * x, double *coef, double intercept, double * out)
    {
        for(; dimension > 0; --dimension, ++x, ++coef)
            intercept += *x * *coef;
        *out = intercept;
        return 1;
    }
""")

ffibuilder.compile(verbose=True)

generating ._linear_regression.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression' extension
C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:Python395_x64include -IC:Python395_x64include -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFCinclude -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037include -IC:Program Files (x86)Windows KitsNETFXSDK4.8includeum -IC:Program Files (x86)Windows Kits10include10.0.19041.0ucrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0shared -IC:Program Files (x86)Windows Kits10include10.0.19041.0um -IC:Program Files (x86)Windows Kits10include10.0.19041.0winrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0cppwinrt /Tc_linear_regression.c /Fo.Release_linear_regression.obj
C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:Python395_x64libs /LIBPATH:C:Python395_x64PCbuildamd64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFClibx64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037libx64 /LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.8libumx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0ucrtx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0umx64 /EXPORT:PyInit__linear_regression .Release_linear_regression.obj /OUT:._linear_regression.cp39-win_amd64.pyd /IMPLIB:.Release_linear_regression.cp39-win_amd64.lib

'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression.cp39-win_amd64.pyd'

La fonction compilée est accessible comme suit.

from _linear_regression import ffi, lib
lib.linreg

<function _linear_regression.Lib.linreg>

On s’inspire de l’exemple How to pass a Numpy array into a cffi function and how to get one back out?.

import numpy
out = numpy.zeros(1)

ptr_coef = clr.coef_.__array_interface__['data'][0]
cptr_coef = ffi.cast ( "double*" , ptr_coef )

x = diabetes_X_test[0:1,:]
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast ( "double*" , ptr_x )

ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "double*" , ptr_out )

n = len(clr.coef_)
lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)

out

array([214.72477745])

On vérifie qu’on obtient bien la même chose.

clr.predict(x)

array([214.72477745])

Et on mesure le temps d’exécution :

memo_time.append(timeexe("cffi-linreg", "lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)"))

Moyenne: 831.37 ns Ecart-type 708.08 ns (with 100 runs) in [416.00 ns, 1.52 µs]

C’est beaucoup plus rapide. Pour être totalement honnête, il faut mesurer les étapes qui consiste à extraire les pointeurs.

def predict_clr(x, clr):
    out = numpy.zeros(1)
    ptr_coef = clr.coef_.__array_interface__['data'][0]
    cptr_coef = ffi.cast ( "double*" , ptr_coef )
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast ( "double*" , ptr_x )
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast ( "double*" , ptr_out )
    lib.linreg(len(x), cptr_x, cptr_coef, clr.intercept_, cptr_out)
    return out

predict_clr(x, clr)

array([154.32457426])

memo_time.append(timeexe("cffi-linreg-wrapped", "predict_clr(x, clr)"))

Moyenne: 7.52 µs Ecart-type 2.34 µs (with 100 runs) in [6.20 µs, 10.42 µs]

Cela reste plus rapide.

cffi - seconde version #

Comme on construit la fonction en dynamique (le code est connu lors de l’exécution), on peut facilement se passer de la boucle et écrire le code sans boucle et avec les coefficients.

res = " + ".join("{0}*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res

'-35.81159277952622*x[0] + -267.39308260812277*x[1] + 503.56121841083586*x[2] + 337.87944183803455*x[3] + -577.2725523621144*x[4] + 373.6293947654621*x[5] + -99.69779326605845*x[6] + 78.39842093764699*x[7] + 656.5430915289373*x[8] + 80.33839980437061*x[9]'

code = """
    static int linreg_custom(double * x, double * out)
    {{
        out[0] = {0} + {1};
    }}
""".format(clr.intercept_, res)
print(code)

static int linreg_custom(double * x, double * out)
{
    out[0] = 152.69613239933642 + -35.81159277952622*x[0] + -267.39308260812277*x[1] + 503.56121841083586*x[2] + 337.87944183803455*x[3] + -577.2725523621144*x[4] + 373.6293947654621*x[5] + -99.69779326605845*x[6] + 78.39842093764699*x[7] + 656.5430915289373*x[8] + 80.33839980437061*x[9];
}

from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom(double *, double *);")
ffibuilder.set_source("_linear_regression_custom", code)
ffibuilder.compile(verbose=True)

generating ._linear_regression_custom.c
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom' extension
C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:Python395_x64include -IC:Python395_x64include -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFCinclude -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037include -IC:Program Files (x86)Windows KitsNETFXSDK4.8includeum -IC:Program Files (x86)Windows Kits10include10.0.19041.0ucrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0shared -IC:Program Files (x86)Windows Kits10include10.0.19041.0um -IC:Program Files (x86)Windows Kits10include10.0.19041.0winrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0cppwinrt /Tc_linear_regression_custom.c /Fo.Release_linear_regression_custom.obj
C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:Python395_x64libs /LIBPATH:C:Python395_x64PCbuildamd64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFClibx64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037libx64 /LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.8libumx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0ucrtx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0umx64 /EXPORT:PyInit__linear_regression_custom .Release_linear_regression_custom.obj /OUT:._linear_regression_custom.cp39-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom.cp39-win_amd64.lib

'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom.cp39-win_amd64.pyd'

from _linear_regression_custom.lib import linreg_custom
linreg_custom(cptr_x, cptr_out)
out

array([214.72477745])

memo_time.append(timeexe("cffi-linreg-custom", "linreg_custom(cptr_x, cptr_out)"))

Moyenne: 466.52 ns Ecart-type 851.96 ns (with 100 runs) in [315.00 ns, 715.00 ns]

On a gagné un facteur 2.

def predict_clr_custom(x):
    out = numpy.zeros(1)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast("double*", ptr_x)
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast("double*", ptr_out)
    linreg_custom(cptr_x, cptr_out)
    return out

predict_clr_custom(x)

array([214.72477745])

memo_time.append(timeexe("cffi-linreg-custom wrapped", "predict_clr_custom(x)"))

Moyenne: 5.27 µs Ecart-type 1.82 µs (with 100 runs) in [4.42 µs, 7.77 µs]

C’est un peu plus rapide.

et en float?#

L’ordinateur fait la distinction entre les double code sur 64 bit et les float codé sur 32 bits. La précision est meilleure dans le premier cas et les calculs sont plus rapides dans le second. Dans le cas du machine learning, on préfère la rapidité à une perte précision en précision qui est souvent compensée par l’optimisation inhérente à tout problème de machine learning. Ce qu’on perd sur une observation, on le retrouve sur une autre.

res = " + ".join("{0}f*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res

'-35.81159277952622f*x[0] + -267.39308260812277f*x[1] + 503.56121841083586f*x[2] + 337.87944183803455f*x[3] + -577.2725523621144f*x[4] + 373.6293947654621f*x[5] + -99.69779326605845f*x[6] + 78.39842093764699f*x[7] + 656.5430915289373f*x[8] + 80.33839980437061f*x[9]'

code = """
    static int linreg_custom_float(float * x, float * out)
    {{
        out[0] = {0}f + {1};
    }}
""".format(clr.intercept_, res)
print(code)

static int linreg_custom_float(float * x, float * out)
{
    out[0] = 152.69613239933642f + -35.81159277952622f*x[0] + -267.39308260812277f*x[1] + 503.56121841083586f*x[2] + 337.87944183803455f*x[3] + -577.2725523621144f*x[4] + 373.6293947654621f*x[5] + -99.69779326605845f*x[6] + 78.39842093764699f*x[7] + 656.5430915289373f*x[8] + 80.33839980437061f*x[9];
}

from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom_float(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float", code)
ffibuilder.compile(verbose=True)

generating ._linear_regression_custom_float.c
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom_float' extension
C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:Python395_x64include -IC:Python395_x64include -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFCinclude -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037include -IC:Program Files (x86)Windows KitsNETFXSDK4.8includeum -IC:Program Files (x86)Windows Kits10include10.0.19041.0ucrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0shared -IC:Program Files (x86)Windows Kits10include10.0.19041.0um -IC:Program Files (x86)Windows Kits10include10.0.19041.0winrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0cppwinrt /Tc_linear_regression_custom_float.c /Fo.Release_linear_regression_custom_float.obj
C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:Python395_x64libs /LIBPATH:C:Python395_x64PCbuildamd64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFClibx64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037libx64 /LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.8libumx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0ucrtx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0umx64 /EXPORT:PyInit__linear_regression_custom_float .Release_linear_regression_custom_float.obj /OUT:._linear_regression_custom_float.cp39-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom_float.cp39-win_amd64.lib

'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float.cp39-win_amd64.pyd'

from _linear_regression_custom_float.lib import linreg_custom_float

def predict_clr_custom_float(x):
    out = numpy.zeros(1, dtype=numpy.float32)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast ( "float*" , ptr_x )
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast ( "float*" , ptr_out )
    linreg_custom_float(cptr_x, cptr_out)
    return out

Avant d’appeler la fonction, on doit transformer le vecteur iniatial en float32.

x32 = x.astype(numpy.float32)
predict_clr_custom(x32)

array([1.27301276e+31])

memo_time.append(timeexe("cffi-linreg-custom-float wrapped", "predict_clr_custom(x32)"))

Moyenne: 5.12 µs Ecart-type 1.60 µs (with 100 runs) in [4.48 µs, 6.44 µs]

La différence n’est pas flagrante. Mesurons le code C uniquement même si la partie Python ne peut pas être complètement évitée.

out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )

memo_time.append(timeexe("cffi-linreg-custom-float32", "linreg_custom_float(cptr_x, cptr_out)"))

Moyenne: 389.19 ns Ecart-type 226.75 ns (with 100 runs) in [317.00 ns, 577.00 ns]

La différence n’est pas significative.

SIMD #

C’est un ensemble d’instructions processeur pour faire des opérations terme à terme sur 4 float32 aussi rapidement qu’une seule. Le processeur ne peut faire des opérations que les nombres sont copiés dans ses registres. Le programme passe alors son temps à copier des nombres depuis la mémoire vers les registres du processeur puis à faire la copie dans le chemin inverse pour le résultat. Les instructions SIMD font gagner du temps du niveau du calcul. Au lieu de faire 4 opérations de multiplication terme à terme, il n’en fait plus qu’une. Il suffit de savoir comment utiliser ces instructions. Avec Visual Studio, elles sont accessible via ces fonctions Memory and Initialization Using Streaming SIMD Extensions. Le code suivant n’est probablement pas optimal mais il n’est pas trop compliqué à suivre.

code = """
#include <xmmintrin.h>

static int linreg_custom_float_simd(float * x, float * out)
{
    __m128 c1 = _mm_set_ps(0.3034995490664121f, -237.63931533353392f, 510.5306054362245f, 327.7369804093466f);
    __m128 c2 = _mm_set_ps(-814.1317093725389f, 492.81458798373245f, 102.84845219168025f, 184.60648905984064f);
    __m128 r1 = _mm_set_ss(152.76430691633442f);
    r1 = _mm_add_ss(r1, _mm_mul_ps(c1, _mm_load_ps(x)));
    r1 = _mm_add_ss(r1, _mm_mul_ps(c2, _mm_load_ps(x+4)));
    float r[4];
    _mm_store_ps(r, r1);
    out[0] = r[0] + r[1] + r[2] + r[3] + 743.5196167505419f * x[8] + 76.095172216624f * x[9];
    return 1;
}
"""

from cffi import FFI
ffibuilder = FFI()

ffibuilder.cdef("int linreg_custom_float_simd(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float_simd", code)
ffibuilder.compile(verbose=True)

generating ._linear_regression_custom_float_simd.c
(already up-to-date)
the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a'
running build_ext
building '_linear_regression_custom_float_simd' extension
C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:Python395_x64include -IC:Python395_x64include -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFCinclude -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037include -IC:Program Files (x86)Windows KitsNETFXSDK4.8includeum -IC:Program Files (x86)Windows Kits10include10.0.19041.0ucrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0shared -IC:Program Files (x86)Windows Kits10include10.0.19041.0um -IC:Program Files (x86)Windows Kits10include10.0.19041.0winrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0cppwinrt /Tc_linear_regression_custom_float_simd.c /Fo.Release_linear_regression_custom_float_simd.obj
C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:Python395_x64libs /LIBPATH:C:Python395_x64PCbuildamd64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFClibx64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037libx64 /LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.8libumx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0ucrtx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0umx64 /EXPORT:PyInit__linear_regression_custom_float_simd .Release_linear_regression_custom_float_simd.obj /OUT:._linear_regression_custom_float_simd.cp39-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom_float_simd.cp39-win_amd64.lib

'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float_simd.cp39-win_amd64.pyd'

from _linear_regression_custom_float_simd.lib import linreg_custom_float_simd

out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )

linreg_custom_float_simd(cptr_x, cptr_out)
out

array([172.00832], dtype=float32)

memo_time.append(timeexe("cffi-linreg-custom-float32-simd", "linreg_custom_float_simd(cptr_x, cptr_out)"))

Moyenne: 418.99 ns Ecart-type 387.18 ns (with 100 runs) in [299.00 ns, 631.00 ns]

C’est légèrement mieux, quelques références :

aligned_vs_unaligned_load.c : c’est du code mais facile à lire.
How to Write Fast Numerical Code

Les processeurs évoluent au fil du temps, 4 float, 8 float, SIMD2, FMA4 Intrinsics Added for Visual Studio 2010 SP1, AVX.

Réécriture purement Python #

On continue avec uniquement du Python sans numpy.

coef = clr.coef_
list(coef)

[-35.81159277952622,
 -267.39308260812277,
 503.56121841083586,
 337.87944183803455,
 -577.2725523621144,
 373.6293947654621,
 -99.69779326605845,
 78.39842093764699,
 656.5430915289373,
 80.33839980437061]

code = str(clr.intercept_) + "+" + "+".join("x[{0}]*({1})".format(i, c) for i, c in enumerate(coef))
code

'152.69613239933642+x[0]*(-35.81159277952622)+x[1]*(-267.39308260812277)+x[2]*(503.56121841083586)+x[3]*(337.87944183803455)+x[4]*(-577.2725523621144)+x[5]*(373.6293947654621)+x[6]*(-99.69779326605845)+x[7]*(78.39842093764699)+x[8]*(656.5430915289373)+x[9]*(80.33839980437061)'

def predict_clr_python(x):
    return 152.764306916+x[0]*0.3034995490664121+x[1]*(-237.63931533353392)+x[2]*510.5306054362245+ \
            x[3]*327.7369804093466+ \
            x[4]*(-814.1317093725389)+x[5]*492.81458798373245+x[6]*102.84845219168025+ \
            x[7]*184.60648905984064+x[8]*743.5196167505419+x[9]*76.095172216624

predict_clr_python(x[0])

211.03463170273153

z = list(x[0])
memo_time.append(timeexe("python-linreg-custom", "predict_clr_python(z)"))

Moyenne: 2.02 µs Ecart-type 670.45 ns (with 100 runs) in [1.70 µs, 2.73 µs]

De façon assez surprenante, c’est plutôt rapide. Et si on y mettait une boucle.

def predict_clr_python_loop(x, coef, intercept):
    return intercept + sum(a*b for a, b in zip(x, coef))

predict_clr_python_loop(x[0], list(clr.coef_), clr.intercept_)

214.72477744760596

coef = list(clr.coef_)
intercept = clr.intercept_
memo_time.append(timeexe("python-linreg", "predict_clr_python_loop(z, coef, intercept)"))

Moyenne: 3.54 µs Ecart-type 1.31 µs (with 100 runs) in [2.68 µs, 6.16 µs]

A peine plus long.

Réécriture avec Python et numpy #

def predict_clr_numpy(x, coef, intercept):
    return intercept + numpy.dot(coef, x).sum()

predict_clr_numpy(x[0], clr.coef_, clr.intercept_)

214.72477744760596

memo_time.append(timeexe("numpy-linreg-numpy", "predict_clr_numpy(z, coef, clr.intercept_)"))

Moyenne: 8.08 µs Ecart-type 3.44 µs (with 100 runs) in [6.44 µs, 12.16 µs]

Les dimensions des tableaux sont trop petites pour que le calcul matriciel apporte une différence. On se retrouve dans le cas cffi où les échanges Python - C grignotent tout le temps de calcul.

numba #

numba essaye de compiler à la volée des bouts de codes écrits en Python. On induque quelle fonction optimiser en faisant précéder la fonction de @jit. Toutes les écritures ne fonctionnent, typiquement, certaines listes en compréhension soulèvent une exception. Il faut donc écrire son code en Python d’une façon assez proche de ce qu’il serait en C.

from numba import jit

@jit
def predict_clr_numba(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba(z, clr.coef_, clr.intercept_)

C:Python395_x64libsite-packagesnumbacoreir_utils.py:2152: NumbaPendingDeprecationWarning: [1m
Encountered the use of a type that is scheduled for deprecation: type 'reflected list' found for argument 'x' of function 'predict_clr_numba'.
For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types
[1m
File "<ipython-input-50-3bf9efb9c9c6>", line 2:[0m
[1m@jit
[1mdef predict_clr_numba(x, coef, intercept):
[0m[1m^[0m[0m
[0m
  warnings.warn(NumbaPendingDeprecationWarning(msg, loc=loc))

214.724777447606

memo_time.append(timeexe("numba-linreg-notype", "predict_clr_numba(z, clr.coef_, clr.intercept_)"))

Moyenne: 23.77 µs Ecart-type 7.36 µs (with 100 runs) in [19.99 µs, 37.64 µs]

Plutôt rapide !

@jit('double(double[:], double[:], double)')
def predict_clr_numba_cast(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)

214.724777447606

memo_time.append(timeexe("numba-linreg-type", "predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)"))

Moyenne: 948.20 ns Ecart-type 411.47 ns (with 100 runs) in [759.00 ns, 1.68 µs]

On voit que plus on donne d’information au compilateur, plus il est capable d’optimiser.

@jit('float32(float32[:], float32[:], float32)')
def predict_clr_numba_cast_float(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
x32 = x[0].astype(numpy.float32)
c32 = clr.coef_.astype(numpy.float32)
i32 = numpy.float32(clr.intercept_)
predict_clr_numba_cast_float(x32, c32, i32)

214.7247772216797

memo_time.append(timeexe("numba-linreg-type-float32", "predict_clr_numba_cast_float(x32, c32, i32)"))

Moyenne: 707.08 ns Ecart-type 268.64 ns (with 100 runs) in [565.00 ns, 1.25 µs]

On essaye avec les coefficients dans la fonction.

@jit('double(double[:])')
def predict_clr_numba_cast_custom(x):
    coef = [ 3.03499549e-01, -2.37639315e+02,  5.10530605e+02,  3.27736980e+02,
            -8.14131709e+02,  4.92814588e+02,  1.02848452e+02,  1.84606489e+02,
             7.43519617e+02,  7.60951722e+01]
    s = 152.76430691633442
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_numba_cast_custom(x[0])

211.034631692416

memo_time.append(timeexe("numba-linreg-type-custom", "predict_clr_numba_cast_custom(x[0])"))

Moyenne: 824.35 ns Ecart-type 371.36 ns (with 100 runs) in [652.00 ns, 1.56 µs]

On se rapproche des temps obtenus avec cffi sans wrapping, cela signifie que numba fait un bien meilleur travail à ce niveau que le wrapper rapidement créé.

@jit('double(double[:], double[:], double)')
def predict_clr_numba_numpy(x, coef, intercept):
    return intercept + numpy.dot(coef, x).sum()

predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)

<ipython-input-58-7020de83c055>:1: NumbaWarning: [1m
Compilation is falling back to object mode WITH looplifting enabled because Function "predict_clr_numba_numpy" failed type inference due to: [1m[1mUnknown attribute 'sum' of type float64
[1m
File "<ipython-input-58-7020de83c055>", line 3:[0m
[1mdef predict_clr_numba_numpy(x, coef, intercept):
[1m    return intercept + numpy.dot(coef, x).sum()
[0m    [1m^[0m[0m
[0m
[0m[1mDuring: typing of get attribute at <ipython-input-58-7020de83c055> (3)[0m
[1m
File "<ipython-input-58-7020de83c055>", line 3:[0m
[1mdef predict_clr_numba_numpy(x, coef, intercept):
[1m    return intercept + numpy.dot(coef, x).sum()
[0m    [1m^[0m[0m
[0m
  @jit('double(double[:], double[:], double)')
C:Python395_x64libsite-packagesnumbacoreobject_mode_passes.py:151: NumbaWarning: [1mFunction "predict_clr_numba_numpy" was compiled in object mode without forceobj=True.
[1m
File "<ipython-input-58-7020de83c055>", line 2:[0m
[1m@jit('double(double[:], double[:], double)')
[1mdef predict_clr_numba_numpy(x, coef, intercept):
[0m[1m^[0m[0m
[0m
  warnings.warn(errors.NumbaWarning(warn_msg,
C:Python395_x64libsite-packagesnumbacoreobject_mode_passes.py:161: NumbaDeprecationWarning: [1m
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.
For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
[1m
File "<ipython-input-58-7020de83c055>", line 2:[0m
[1m@jit('double(double[:], double[:], double)')
[1mdef predict_clr_numba_numpy(x, coef, intercept):
[0m[1m^[0m[0m
[0m
  warnings.warn(errors.NumbaDeprecationWarning(msg,

214.72477744760596

memo_time.append(timeexe("numba-linreg-type-numpy", "predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)"))

Moyenne: 5.15 µs Ecart-type 1.78 µs (with 100 runs) in [4.37 µs, 6.00 µs]

numba est moins performant quand numpy est impliqué car le code de numpy n’est pas réécrit, il est appelé.

cython #

cython permet de créer des extensions C de plus grande envergure que numba. C’est l’option choisie par scikit-learn. Il vaut mieux connaître le C pour s’en servir et là encore, l’objectif est de réduire les échanges Python / C qui coûtent cher.

%load_ext cython

%%cython
def predict_clr_cython(x, coef, intercept):
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s

predict_clr_cython(x[0], clr.coef_, clr.intercept_)

214.724777447606

memo_time.append(timeexe("cython-linreg", "predict_clr_cython(x[0], clr.coef_, clr.intercept_)"))

Moyenne: 2.71 µs Ecart-type 1.60 µs (with 100 runs) in [1.92 µs, 7.19 µs]

Cython fait moins bien que numba dans notre cas et l’optimisation proposée est assez proche du temps déjà obtenue avec le langage Python seul. Cela est dû au fait que la plupart des objets tels que du code associé aux listes ou aux dictionnaires ont été réécrits en C.

%%cython
cimport numpy as npc

def predict_clr_cython_type(npc.ndarray[double, ndim=1, mode='c'] x,
                            npc.ndarray[double, ndim=1, mode='c'] coef,
                            double intercept):
    cdef double s = intercept
    for i in range(0, x.shape[0]):
        s += coef[i] * x[i]
    return s

predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)

214.724777447606

memo_time.append(timeexe(
    "cython-linreg-type", "predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)"))

Moyenne: 721.31 ns Ecart-type 399.10 ns (with 100 runs) in [533.00 ns, 1.44 µs]

Le temps est quasi identique avec un écart type moins grand de façon significative.

Une dernière option : ONNX #

ONNX est un format de sérialisation qui permet de décrire un modèle de modèle de machine learning ou de deep learning. Cela permet de dissocer le modèle de la librairie qui a servi à le produire (voir ML.net and ONNX).

try:
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import FloatTensorType
    import onnxruntime
    import onnx
    ok_onnx = True
    print("onnx, skl2onnx, onnxruntime sont disponibles.")

    def save_model(onnx_model, filename):
        with open(filename, "wb") as f:
            f.write(onnx_model.SerializeToString())
except ImportError as e:
    print("La suite requiert onnx, skl2onnx et onnxruntime.")
    print(e)
    ok_onnx = False

Error in sys.excepthook:
Traceback (most recent call last):
  File "C:Python395_x64libsite-packagesIPythoncoreinteractiveshell.py", line 1934, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'RuntimeError' object has no attribute '_render_traceback_'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "C:Python395_x64libsite-packagesIPythoncoreinteractiveshell.py", line 1936, in showtraceback
    stb = self.InteractiveTB.structured_traceback(etype,
  File "C:Python395_x64libsite-packagesIPythoncoreultratb.py", line 1105, in structured_traceback
    return FormattedTB.structured_traceback(
  File "C:Python395_x64libsite-packagesIPythoncoreultratb.py", line 999, in structured_traceback
    return VerboseTB.structured_traceback(
  File "C:Python395_x64libsite-packagesIPythoncoreultratb.py", line 851, in structured_traceback
    assert etb is not None
AssertionError
Original exception was:
RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

onnx, skl2onnx, onnxruntime sont disponibles.

On convertit le modèle au format ONNX.

if ok_onnx:
    onnx_model = convert_sklearn(
        clr, 'model', [('input', FloatTensorType([None, clr.coef_.shape[0]]))],
        target_opset=11)
    onnx_model.ir_version = 6
    save_model(onnx_model, 'model.onnx')

    model_onnx = onnx.load('model.onnx')
    print("Modèle sérialisé au format ONNX")
    print(model_onnx)
else:
    print("onnx, onnxmltools, onnxruntime sont disponibles.")

Modèle sérialisé au format ONNX
ir_version: 6
producer_name: "skl2onnx"
producer_version: "1.10.4"
domain: "ai.onnx"
model_version: 0
doc_string: ""
graph {
  node {
    input: "input"
    output: "variable"
    name: "LinearRegressor"
    op_type: "LinearRegressor"
    attribute {
      name: "coefficients"
      floats: -35.81159210205078
      floats: -267.3930969238281
      floats: 503.56121826171875
      floats: 337.87945556640625
      floats: -577.2725219726562
      floats: 373.62939453125
      floats: -99.69779205322266
      floats: 78.39842224121094
      floats: 656.5430908203125
      floats: 80.3384017944336
      type: FLOATS
    }
    attribute {
      name: "intercepts"
      floats: 152.69613647460938
      type: FLOATS
    }
    domain: "ai.onnx.ml"
  }
  name: "model"
  input {
    name: "input"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 10
          }
        }
      }
    }
  }
  output {
    name: "variable"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
}
opset_import {
  domain: "ai.onnx.ml"
  version: 1
}
opset_import {
  domain: ""
  version: 11
}

On calcule les prédictions. Le module {onnxruntime](https://docs.microsoft.com/en-us/python/api/overview/azure/onnx/intro?view=azure-onnx-py) optimise les calculs pour des modèles de deep learning. Cela explique pourquoi tous les calculs sont réalisés avec des réels représentés sur 4 octets numpy.float32.

if ok_onnx:
    sess = onnxruntime.InferenceSession("model.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt(x):
        return sess.run(["variable"], {'input': x})

    print("Prediction:", predict_onnxrt(x.astype(numpy.float32)))

Input: NodeArg(name='input', type='tensor(float)', shape=[None, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[None, 1])
Prediction: [array([[214.72478]], dtype=float32)]

if ok_onnx:
    x32 = x.astype(numpy.float32)
    memo_time.append(timeexe("onnxruntime-float32", "predict_onnxrt(x32)"))
    memo_time.append(timeexe("onnxruntime-float64", "predict_onnxrt(x.astype(numpy.float32))"))

Moyenne: 13.00 µs Ecart-type 7.69 µs (with 100 runs) in [9.71 µs, 23.64 µs]
Moyenne: 12.69 µs Ecart-type 1.93 µs (with 100 runs) in [11.29 µs, 16.23 µs]

Récapitulatif #

import pandas
df = pandas.DataFrame(data=memo_time)
df = df.set_index("legend").sort_values("average")
df

	average	deviation	first	first3	last3	repeat	min5	max5	code	run
legend
cffi-linreg-custom-float32	3.891910e-07	2.267541e-07	5.608000e-06	2.196000e-06	3.763333e-07	1000	3.170000e-07	5.770000e-07	linreg_custom_float(cptr_x, cptr_out)	100
cffi-linreg-custom-float32-simd	4.189890e-07	3.871792e-07	1.091200e-05	3.949667e-06	2.996667e-07	1000	2.990000e-07	6.310000e-07	linreg_custom_float_simd(cptr_x, cptr_out)	100
cffi-linreg-custom	4.665150e-07	8.519581e-07	2.679900e-05	9.352667e-06	3.256667e-07	1000	3.150000e-07	7.150000e-07	linreg_custom(cptr_x, cptr_out)	100
numba-linreg-type-float32	7.070790e-07	2.686359e-07	1.162000e-06	1.083333e-06	5.663333e-07	1000	5.650000e-07	1.249000e-06	predict_clr_numba_cast_float(x32, c32, i32)	100
cython-linreg-type	7.213150e-07	3.991047e-07	1.252000e-06	8.300000e-07	5.513333e-07	1000	5.330000e-07	1.443000e-06	predict_clr_cython_type(x[0], clr.coef_, clr.i...	100
numba-linreg-type-custom	8.243540e-07	3.713608e-07	7.940000e-07	7.156667e-07	6.543333e-07	1000	6.520000e-07	1.558000e-06	predict_clr_numba_cast_custom(x[0])	100
cffi-linreg	8.313670e-07	7.080831e-07	6.414000e-06	3.244000e-06	4.170000e-07	1000	4.160000e-07	1.519000e-06	lib.linreg(n, cptr_x, cptr_coef, clr.intercept...	100
numba-linreg-type	9.482040e-07	4.114651e-07	9.350000e-07	8.663333e-07	7.596667e-07	1000	7.590000e-07	1.678000e-06	predict_clr_numba_cast(x[0], clr.coef_, clr.in...	100
python-linreg-custom	2.018942e-06	6.704544e-07	5.511000e-06	4.254667e-06	1.703667e-06	1000	1.696000e-06	2.731000e-06	predict_clr_python(z)	100
cython-linreg	2.706254e-06	1.597806e-06	5.083000e-06	5.419333e-06	2.126000e-06	1000	1.920000e-06	7.194000e-06	predict_clr_cython(x[0], clr.coef_, clr.interc...	100
python-linreg	3.539523e-06	1.306156e-06	8.761000e-06	7.510000e-06	2.779667e-06	1000	2.681000e-06	6.164000e-06	predict_clr_python_loop(z, coef, intercept)	100
cffi-linreg-custom-float wrapped	5.123886e-06	1.598363e-06	1.200400e-05	1.176767e-05	4.483000e-06	1000	4.477000e-06	6.436000e-06	predict_clr_custom(x32)	100
numba-linreg-type-numpy	5.147404e-06	1.775723e-06	1.874100e-05	1.572433e-05	4.474333e-06	1000	4.374000e-06	5.996000e-06	predict_clr_numba_numpy(x[0], clr.coef_, clr.i...	100
cffi-linreg-custom wrapped	5.274568e-06	1.823247e-06	2.166200e-05	2.268700e-05	5.626667e-06	1000	4.422000e-06	7.773000e-06	predict_clr_custom(x)	100
cffi-linreg-wrapped	7.519599e-06	2.343424e-06	1.580000e-05	2.028933e-05	6.263333e-06	1000	6.201000e-06	1.041900e-05	predict_clr(x, clr)	100
numpy-linreg-numpy	8.081947e-06	3.442724e-06	3.679000e-05	3.075167e-05	6.525667e-06	1000	6.442000e-06	1.216200e-05	predict_clr_numpy(z, coef, clr.intercept_)	100
onnxruntime-float64	1.269215e-05	1.926911e-06	1.742200e-05	1.337233e-05	1.133667e-05	1000	1.129500e-05	1.623200e-05	predict_onnxrt(x.astype(numpy.float32))	100
onnxruntime-float32	1.299773e-05	7.686900e-06	2.281400e-05	1.689933e-05	1.009533e-05	1000	9.713000e-06	2.363700e-05	predict_onnxrt(x32)	100
numba-linreg-notype	2.376539e-05	7.362380e-06	3.079800e-05	2.445400e-05	3.723367e-05	1000	1.998900e-05	3.763900e-05	predict_clr_numba(z, clr.coef_, clr.intercept_)	100
sklearn.predict	4.550096e-05	6.337585e-06	7.724200e-05	6.447133e-05	4.143867e-05	1000	4.087300e-05	5.295400e-05	clr.predict(z)	100

On enlève quelques colonnes et on rappelle :

cffi: signifie optimisé avec cffi
custom: pas de boucle mais la fonction ne peut prédire qu’une seule régression linéaire
float32: utilise des float et non des double
linreg: régression linéaire
numba: optimisation avec numba
numpy: optimisation avec numpy
python: pas de C, que du python
simd: optimisé avec les instructions SIMD
sklearn: fonction sklearn.predict
static: la fonction utilise des variables statiques
type: la fonction est typée et ne fonctionne qu’avec un type précis en entrée.
wrapped: code optimisé mais embabllé dans une fonction Python qui elle ne l’est pas (les containers sont recréés à chaque fois)

cols = ["average", "deviation", "min5", "max5", "run", "code"]
df[cols]

	average	deviation	min5	max5	run	code
legend
cffi-linreg-custom-float32	3.891910e-07	2.267541e-07	3.170000e-07	5.770000e-07	100	linreg_custom_float(cptr_x, cptr_out)
cffi-linreg-custom-float32-simd	4.189890e-07	3.871792e-07	2.990000e-07	6.310000e-07	100	linreg_custom_float_simd(cptr_x, cptr_out)
cffi-linreg-custom	4.665150e-07	8.519581e-07	3.150000e-07	7.150000e-07	100	linreg_custom(cptr_x, cptr_out)
numba-linreg-type-float32	7.070790e-07	2.686359e-07	5.650000e-07	1.249000e-06	100	predict_clr_numba_cast_float(x32, c32, i32)
cython-linreg-type	7.213150e-07	3.991047e-07	5.330000e-07	1.443000e-06	100	predict_clr_cython_type(x[0], clr.coef_, clr.i...
numba-linreg-type-custom	8.243540e-07	3.713608e-07	6.520000e-07	1.558000e-06	100	predict_clr_numba_cast_custom(x[0])
cffi-linreg	8.313670e-07	7.080831e-07	4.160000e-07	1.519000e-06	100	lib.linreg(n, cptr_x, cptr_coef, clr.intercept...
numba-linreg-type	9.482040e-07	4.114651e-07	7.590000e-07	1.678000e-06	100	predict_clr_numba_cast(x[0], clr.coef_, clr.in...
python-linreg-custom	2.018942e-06	6.704544e-07	1.696000e-06	2.731000e-06	100	predict_clr_python(z)
cython-linreg	2.706254e-06	1.597806e-06	1.920000e-06	7.194000e-06	100	predict_clr_cython(x[0], clr.coef_, clr.interc...
python-linreg	3.539523e-06	1.306156e-06	2.681000e-06	6.164000e-06	100	predict_clr_python_loop(z, coef, intercept)
cffi-linreg-custom-float wrapped	5.123886e-06	1.598363e-06	4.477000e-06	6.436000e-06	100	predict_clr_custom(x32)
numba-linreg-type-numpy	5.147404e-06	1.775723e-06	4.374000e-06	5.996000e-06	100	predict_clr_numba_numpy(x[0], clr.coef_, clr.i...
cffi-linreg-custom wrapped	5.274568e-06	1.823247e-06	4.422000e-06	7.773000e-06	100	predict_clr_custom(x)
cffi-linreg-wrapped	7.519599e-06	2.343424e-06	6.201000e-06	1.041900e-05	100	predict_clr(x, clr)
numpy-linreg-numpy	8.081947e-06	3.442724e-06	6.442000e-06	1.216200e-05	100	predict_clr_numpy(z, coef, clr.intercept_)
onnxruntime-float64	1.269215e-05	1.926911e-06	1.129500e-05	1.623200e-05	100	predict_onnxrt(x.astype(numpy.float32))
onnxruntime-float32	1.299773e-05	7.686900e-06	9.713000e-06	2.363700e-05	100	predict_onnxrt(x32)
numba-linreg-notype	2.376539e-05	7.362380e-06	1.998900e-05	3.763900e-05	100	predict_clr_numba(z, clr.coef_, clr.intercept_)
sklearn.predict	4.550096e-05	6.337585e-06	4.087300e-05	5.295400e-05	100	clr.predict(z)

%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(14,6))
df[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                  legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");

<ipython-input-73-4cda8b1055aa>:7: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
  ax.grid(b=True, which="major")
<ipython-input-73-4cda8b1055aa>:8: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
  ax.grid(b=True, which="minor");

../_images/cffi_linear_regression_109_1.png

Il manque à ce comparatif le GPU mais c’est un peu plus complexe à mettre en oeuvre, il faut une carte GPU et la parallélisation n’apporterait pas énormément compte tenu de la faible dimension du problème.

Prédiction one-off et biais de mesure #

Le graphique précédent montre que la fonction predict de scikit-learn est la plus lente. La première raison est que ce code est valable pour toutes les régresssions linéaires alors que toutes les autres fonctions sont spécialisées pour un seul modèle. La seconde raison est que le code de scikit-learn est optimisé pour le calcul de plusieurs prédictions à la fois alors que toutes les autres fonctions n’en calcule qu’une seule (scénario dit one-off). On compare à ce que donnerait unev version purement python et numpy.

def predict_clr_python_loop_multi(x, coef, intercept):
    # On s'attend à deux dimension.
    res = numpy.zeros((x.shape[0], 1))
    res[:, 0] = intercept
    for i in range(0, x.shape[0]):
        res[i, 0] += sum(a*b for a, b in zip(x[i, :], coef))
    return res

predict_clr_python_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)

array([[214.72477745],
       [175.29091463]])

def predict_clr_numpy_loop_multi(x, coef, intercept):
    # On s'attend à deux dimension.
    res = numpy.ones((x.shape[0], 1)) * intercept
    res += x @ coef.reshape((len(coef), 1))
    return res

predict_clr_numpy_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)

array([[214.72477745],
       [175.29091463]])

def predict_clr_numba_cast_multi(X, coef, intercept):
    return [predict_clr_numba_cast(x, coef, intercept) for x in X]

predict_clr_numba_cast_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)

[214.724777447606, 175.29091463098356]

def predict_clr_cython_type_multi(X, coef, intercept):
    return [predict_clr_cython_type(x, coef, intercept) for x in X]

predict_clr_cython_type_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)

[214.724777447606, 175.29091463098356]

memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
for i in batch:
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]

    print("batch", "=", i)
    repeat=20 if i >= 5000 else 100

    memo.append(timeexe("sklearn.predict %d" % i, "clr.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"

    if i <= 1000:
        # très lent
        memo.append(timeexe("python %d" % i, "predict_clr_python_loop_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=20, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "python"

    memo.append(timeexe("numpy %d" % i, "predict_clr_numpy_loop_multi(mx, clr.coef_, clr.intercept_)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "numpy"

    if i <= 10000:
        # très lent
        memo.append(timeexe("numba %d" % i, "predict_clr_numba_cast_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "numba"

    if i <= 1000:
        # très lent
        memo.append(timeexe("cython %d" % i, "predict_clr_cython_type_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "cython"

    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"

batch = 1
Moyenne: 62.45 µs Ecart-type 33.39 µs (with 10 runs) in [41.71 µs, 119.15 µs]
Moyenne: 6.43 µs Ecart-type 496.70 ns (with 10 runs) in [6.26 µs, 8.59 µs]
Moyenne: 12.10 µs Ecart-type 8.26 µs (with 10 runs) in [8.20 µs, 26.70 µs]
Moyenne: 1.99 µs Ecart-type 259.55 ns (with 10 runs) in [1.95 µs, 1.99 µs]
Moyenne: 1.61 µs Ecart-type 103.91 ns (with 10 runs) in [1.59 µs, 1.63 µs]
Moyenne: 19.38 µs Ecart-type 11.88 µs (with 10 runs) in [13.14 µs, 43.26 µs]
batch = 10
Moyenne: 73.96 µs Ecart-type 41.93 µs (with 10 runs) in [43.50 µs, 116.26 µs]
Moyenne: 101.06 µs Ecart-type 3.73 µs (with 10 runs) in [98.16 µs, 111.41 µs]
Moyenne: 19.06 µs Ecart-type 31.16 µs (with 10 runs) in [11.86 µs, 25.57 µs]
Moyenne: 10.84 µs Ecart-type 5.26 µs (with 10 runs) in [8.34 µs, 22.70 µs]
Moyenne: 10.46 µs Ecart-type 5.49 µs (with 10 runs) in [5.69 µs, 20.84 µs]
Moyenne: 19.66 µs Ecart-type 25.05 µs (with 10 runs) in [12.23 µs, 34.34 µs]
batch = 100
Moyenne: 68.65 µs Ecart-type 26.04 µs (with 10 runs) in [46.99 µs, 119.00 µs]
Moyenne: 740.30 µs Ecart-type 156.38 µs (with 10 runs) in [512.14 µs, 1.02 ms]
Moyenne: 10.75 µs Ecart-type 3.89 µs (with 10 runs) in [8.72 µs, 16.79 µs]
Moyenne: 94.32 µs Ecart-type 14.42 µs (with 10 runs) in [72.11 µs, 124.00 µs]
Moyenne: 67.23 µs Ecart-type 31.05 µs (with 10 runs) in [43.72 µs, 135.94 µs]
Moyenne: 91.28 µs Ecart-type 164.49 µs (with 10 runs) in [15.53 µs, 481.48 µs]
batch = 200
Moyenne: 68.82 µs Ecart-type 38.95 µs (with 10 runs) in [46.58 µs, 152.87 µs]
Moyenne: 1.59 ms Ecart-type 497.08 µs (with 10 runs) in [1.09 ms, 2.98 ms]
Moyenne: 11.66 µs Ecart-type 2.01 µs (with 10 runs) in [9.79 µs, 16.71 µs]
Moyenne: 167.67 µs Ecart-type 37.37 µs (with 10 runs) in [133.64 µs, 240.53 µs]
Moyenne: 102.09 µs Ecart-type 25.18 µs (with 10 runs) in [86.07 µs, 162.09 µs]
Moyenne: 18.04 µs Ecart-type 8.31 µs (with 10 runs) in [15.00 µs, 34.57 µs]
batch = 500
Moyenne: 63.53 µs Ecart-type 20.92 µs (with 10 runs) in [50.94 µs, 116.69 µs]
Moyenne: 3.22 ms Ecart-type 296.30 µs (with 10 runs) in [2.84 ms, 3.80 ms]
Moyenne: 13.91 µs Ecart-type 4.58 µs (with 10 runs) in [11.80 µs, 26.71 µs]
Moyenne: 410.88 µs Ecart-type 73.68 µs (with 10 runs) in [333.06 µs, 523.19 µs]
Moyenne: 263.08 µs Ecart-type 117.22 µs (with 10 runs) in [211.75 µs, 444.83 µs]
Moyenne: 22.28 µs Ecart-type 12.93 µs (with 10 runs) in [19.16 µs, 37.56 µs]
batch = 1000
Moyenne: 153.47 µs Ecart-type 43.85 µs (with 10 runs) in [125.94 µs, 229.51 µs]
Moyenne: 5.52 ms Ecart-type 389.98 µs (with 10 runs) in [4.99 ms, 6.18 ms]
Moyenne: 83.03 µs Ecart-type 22.52 µs (with 10 runs) in [73.67 µs, 95.91 µs]
Moyenne: 702.77 µs Ecart-type 76.26 µs (with 10 runs) in [661.35 µs, 888.11 µs]
Moyenne: 445.87 µs Ecart-type 53.38 µs (with 10 runs) in [420.78 µs, 548.59 µs]
Moyenne: 27.48 µs Ecart-type 6.31 µs (with 10 runs) in [26.53 µs, 29.87 µs]
batch = 2000
Moyenne: 147.73 µs Ecart-type 19.47 µs (with 10 runs) in [132.10 µs, 187.47 µs]
Moyenne: 83.71 µs Ecart-type 4.68 µs (with 10 runs) in [79.25 µs, 93.14 µs]
Moyenne: 1.58 ms Ecart-type 216.25 µs (with 10 runs) in [1.32 ms, 1.97 ms]
Moyenne: 47.31 µs Ecart-type 20.65 µs (with 10 runs) in [37.00 µs, 97.03 µs]
batch = 3000
Moyenne: 179.79 µs Ecart-type 45.35 µs (with 10 runs) in [144.02 µs, 310.74 µs]
Moyenne: 92.27 µs Ecart-type 7.05 µs (with 10 runs) in [84.71 µs, 106.64 µs]
Moyenne: 2.37 ms Ecart-type 267.39 µs (with 10 runs) in [1.99 ms, 2.91 ms]
Moyenne: 50.69 µs Ecart-type 6.32 µs (with 10 runs) in [48.65 µs, 52.22 µs]
batch = 4000
Moyenne: 193.02 µs Ecart-type 28.74 µs (with 10 runs) in [173.71 µs, 211.52 µs]
Moyenne: 100.06 µs Ecart-type 22.27 µs (with 10 runs) in [85.61 µs, 133.38 µs]
Moyenne: 3.13 ms Ecart-type 296.38 µs (with 10 runs) in [2.73 ms, 3.54 ms]
Moyenne: 64.67 µs Ecart-type 7.43 µs (with 10 runs) in [59.90 µs, 68.08 µs]
batch = 5000
Moyenne: 215.06 µs Ecart-type 46.52 µs (with 10 runs) in [196.06 µs, 411.19 µs]
Moyenne: 110.91 µs Ecart-type 8.08 µs (with 10 runs) in [90.36 µs, 122.94 µs]
Moyenne: 3.49 ms Ecart-type 212.13 µs (with 10 runs) in [3.30 ms, 4.04 ms]
Moyenne: 78.86 µs Ecart-type 5.47 µs (with 10 runs) in [77.15 µs, 102.21 µs]
batch = 10000
Moyenne: 248.75 µs Ecart-type 64.14 µs (with 10 runs) in [192.57 µs, 425.01 µs]
Moyenne: 116.55 µs Ecart-type 17.05 µs (with 10 runs) in [100.13 µs, 152.60 µs]
Moyenne: 7.18 ms Ecart-type 420.77 µs (with 10 runs) in [6.62 ms, 8.15 ms]
Moyenne: 153.30 µs Ecart-type 13.69 µs (with 10 runs) in [149.03 µs, 211.69 µs]
batch = 20000
Moyenne: 293.81 µs Ecart-type 19.49 µs (with 10 runs) in [283.46 µs, 364.31 µs]
Moyenne: 147.12 µs Ecart-type 8.23 µs (with 10 runs) in [135.43 µs, 160.67 µs]
Moyenne: 215.69 µs Ecart-type 14.46 µs (with 10 runs) in [204.68 µs, 262.99 µs]
batch = 50000
Moyenne: 1.00 ms Ecart-type 44.28 µs (with 10 runs) in [967.01 µs, 1.13 ms]
Moyenne: 503.33 µs Ecart-type 13.21 µs (with 10 runs) in [491.66 µs, 551.81 µs]
Moyenne: 1.86 ms Ecart-type 1.14 ms (with 10 runs) in [1.13 ms, 4.90 ms]
batch = 75000
Moyenne: 1.75 ms Ecart-type 153.53 µs (with 10 runs) in [1.56 ms, 1.94 ms]
Moyenne: 663.38 µs Ecart-type 20.47 µs (with 10 runs) in [630.15 µs, 700.62 µs]
Moyenne: 1.88 ms Ecart-type 173.99 µs (with 10 runs) in [1.65 ms, 2.14 ms]
batch = 100000
Moyenne: 2.56 ms Ecart-type 204.42 µs (with 10 runs) in [2.27 ms, 2.85 ms]
Moyenne: 1.21 ms Ecart-type 113.75 µs (with 10 runs) in [1.04 ms, 1.44 ms]
Moyenne: 2.98 ms Ecart-type 934.23 µs (with 10 runs) in [2.22 ms, 6.31 ms]
batch = 150000
Moyenne: 4.00 ms Ecart-type 188.08 µs (with 10 runs) in [3.78 ms, 4.46 ms]
Moyenne: 2.92 ms Ecart-type 344.26 µs (with 10 runs) in [2.54 ms, 3.93 ms]
Moyenne: 3.76 ms Ecart-type 308.56 µs (with 10 runs) in [3.26 ms, 4.52 ms]
batch = 200000
Moyenne: 5.73 ms Ecart-type 424.36 µs (with 10 runs) in [5.17 ms, 6.72 ms]
Moyenne: 4.00 ms Ecart-type 606.67 µs (with 10 runs) in [3.50 ms, 6.04 ms]
Moyenne: 5.44 ms Ecart-type 742.52 µs (with 10 runs) in [4.57 ms, 7.38 ms]
batch = 300000
Moyenne: 8.36 ms Ecart-type 1.26 ms (with 10 runs) in [7.78 ms, 13.52 ms]
Moyenne: 5.37 ms Ecart-type 352.34 µs (with 10 runs) in [5.08 ms, 6.64 ms]
Moyenne: 7.18 ms Ecart-type 680.24 µs (with 10 runs) in [6.69 ms, 8.83 ms]
batch = 400000
Moyenne: 11.49 ms Ecart-type 1.16 ms (with 10 runs) in [10.36 ms, 15.15 ms]
Moyenne: 7.87 ms Ecart-type 709.04 µs (with 10 runs) in [7.18 ms, 9.70 ms]
Moyenne: 10.51 ms Ecart-type 900.27 µs (with 10 runs) in [9.41 ms, 13.22 ms]
batch = 500000
Moyenne: 15.01 ms Ecart-type 1.90 ms (with 10 runs) in [12.99 ms, 20.81 ms]
Moyenne: 11.02 ms Ecart-type 889.69 µs (with 10 runs) in [9.64 ms, 13.29 ms]
Moyenne: 17.02 ms Ecart-type 2.13 ms (with 10 runs) in [14.72 ms, 22.19 ms]
batch = 600000
Moyenne: 21.19 ms Ecart-type 1.93 ms (with 10 runs) in [18.32 ms, 26.29 ms]
Moyenne: 12.47 ms Ecart-type 964.03 µs (with 10 runs) in [11.00 ms, 14.31 ms]
Moyenne: 18.04 ms Ecart-type 2.80 ms (with 10 runs) in [13.37 ms, 24.63 ms]

dfb = pandas.DataFrame(memo)[["average", "lib", "batch"]]
piv = dfb.pivot("batch", "lib", "average")
piv

lib	cython	numba	numpy	onnxruntime	python	sklearn
batch
1	0.000002	0.000002	0.000012	0.000019	0.000006	0.000062
10	0.000010	0.000011	0.000019	0.000020	0.000101	0.000074
100	0.000067	0.000094	0.000011	0.000091	0.000740	0.000069
200	0.000102	0.000168	0.000012	0.000018	0.001590	0.000069
500	0.000263	0.000411	0.000014	0.000022	0.003225	0.000064
1000	0.000446	0.000703	0.000083	0.000027	0.005516	0.000153
2000	NaN	0.001580	0.000084	0.000047	NaN	0.000148
3000	NaN	0.002371	0.000092	0.000051	NaN	0.000180
4000	NaN	0.003125	0.000100	0.000065	NaN	0.000193
5000	NaN	0.003490	0.000111	0.000079	NaN	0.000215
10000	NaN	0.007181	0.000117	0.000153	NaN	0.000249
20000	NaN	NaN	0.000147	0.000216	NaN	0.000294
50000	NaN	NaN	0.000503	0.001863	NaN	0.001000
75000	NaN	NaN	0.000663	0.001879	NaN	0.001749
100000	NaN	NaN	0.001209	0.002980	NaN	0.002557
150000	NaN	NaN	0.002923	0.003762	NaN	0.004001
200000	NaN	NaN	0.004001	0.005440	NaN	0.005731
300000	NaN	NaN	0.005366	0.007180	NaN	0.008365
400000	NaN	NaN	0.007872	0.010510	NaN	0.011489
500000	NaN	NaN	0.011016	0.017021	NaN	0.015013
600000	NaN	NaN	0.012468	0.018040	NaN	0.021193

for c in piv.columns:
    piv["ave_" + c] = piv[c] / piv.index
piv

lib	cython	numba	numpy	onnxruntime	python	sklearn	ave_cython	ave_numba	ave_numpy	ave_onnxruntime	ave_python	ave_sklearn
batch
1	0.000002	0.000002	0.000012	0.000019	0.000006	0.000062	1.614900e-06	1.990900e-06	1.209790e-05	1.938460e-05	0.000006	6.245340e-05
10	0.000010	0.000011	0.000019	0.000020	0.000101	0.000074	1.046460e-06	1.084320e-06	1.906010e-06	1.965810e-06	0.000010	7.396440e-06
100	0.000067	0.000094	0.000011	0.000091	0.000740	0.000069	6.722760e-07	9.431990e-07	1.075410e-07	9.127790e-07	0.000007	6.865190e-07
200	0.000102	0.000168	0.000012	0.000018	0.001590	0.000069	5.104525e-07	8.383455e-07	5.827850e-08	9.019900e-08	0.000008	3.440995e-07
500	0.000263	0.000411	0.000014	0.000022	0.003225	0.000064	5.261610e-07	8.217592e-07	2.781740e-08	4.455220e-08	0.000006	1.270610e-07
1000	0.000446	0.000703	0.000083	0.000027	0.005516	0.000153	4.458687e-07	7.027674e-07	8.303090e-08	2.747640e-08	0.000006	1.534708e-07
2000	NaN	0.001580	0.000084	0.000047	NaN	0.000148	NaN	7.899395e-07	4.185515e-08	2.365645e-08	NaN	7.386540e-08
3000	NaN	0.002371	0.000092	0.000051	NaN	0.000180	NaN	7.902492e-07	3.075760e-08	1.689707e-08	NaN	5.992867e-08
4000	NaN	0.003125	0.000100	0.000065	NaN	0.000193	NaN	7.813673e-07	2.501480e-08	1.616818e-08	NaN	4.825388e-08
5000	NaN	0.003490	0.000111	0.000079	NaN	0.000215	NaN	6.979748e-07	2.218220e-08	1.577170e-08	NaN	4.301210e-08
10000	NaN	0.007181	0.000117	0.000153	NaN	0.000249	NaN	7.180820e-07	1.165535e-08	1.533050e-08	NaN	2.487490e-08
20000	NaN	NaN	0.000147	0.000216	NaN	0.000294	NaN	NaN	7.356025e-09	1.078465e-08	NaN	1.469057e-08
50000	NaN	NaN	0.000503	0.001863	NaN	0.001000	NaN	NaN	1.006655e-08	3.725768e-08	NaN	2.000188e-08
75000	NaN	NaN	0.000663	0.001879	NaN	0.001749	NaN	NaN	8.845087e-09	2.505991e-08	NaN	2.331396e-08
100000	NaN	NaN	0.001209	0.002980	NaN	0.002557	NaN	NaN	1.208690e-08	2.980086e-08	NaN	2.556766e-08
150000	NaN	NaN	0.002923	0.003762	NaN	0.004001	NaN	NaN	1.948814e-08	2.508106e-08	NaN	2.667062e-08
200000	NaN	NaN	0.004001	0.005440	NaN	0.005731	NaN	NaN	2.000416e-08	2.720136e-08	NaN	2.865267e-08
300000	NaN	NaN	0.005366	0.007180	NaN	0.008365	NaN	NaN	1.788538e-08	2.393301e-08	NaN	2.788189e-08
400000	NaN	NaN	0.007872	0.010510	NaN	0.011489	NaN	NaN	1.967972e-08	2.627497e-08	NaN	2.872169e-08
500000	NaN	NaN	0.011016	0.017021	NaN	0.015013	NaN	NaN	2.203297e-08	3.404131e-08	NaN	3.002589e-08
600000	NaN	NaN	0.012468	0.018040	NaN	0.021193	NaN	NaN	2.077927e-08	3.006664e-08	NaN	3.532122e-08

libs = list(c for c in piv.columns if "ave_" in c)
ax = piv.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch")
ax.grid(True);

../_images/cffi_linear_regression_119_0.png

Le minimum obtenu est pour $10^{-8} s$ soit 10 ns. Cela montre que la comparaisson précédente était incomplète voire biaisée. Tout dépend de l’usage qu’on fait de la fonction de prédiction même s’il sera toujours possible de d’écrire un code spécialisé plus rapide que toute autre fonction générique. En général, plus on reste du côté Python, plus le programme est lent. Le nombre de passage de l’un à l’autre, selon la façon dont il est fait ralenti aussi. En tenant compte de cela, le programme rouge sera plus lent que le vert.

from pyquickhelper.helpgen import NbImage
NbImage("pycpp.png")

../_images/cffi_linear_regression_121_0.png

Ces résultats sont d’une façon générale assez volatile car le temps de calcul est enrobé dans plusieurs fonctions Python qui rendent une mesure précise difficile. Il reste néanmoins une bonne idée des ordres de grandeurs.

Random Forest #

On reproduit les mêmes résultats pour une random forest mais la réécriture n’est plus aussi simple qu’une régression linéaire.

Une prédiction à la fois #

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test  = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test  = diabetes.target[-20:]

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10)
rf.fit(diabetes_X_train, diabetes_y_train)

RandomForestRegressor(n_estimators=10)

memo_time = []
x = diabetes_X_test[:1]
memo_time.append(timeexe("sklearn-rf", "rf.predict(x)", repeat=100, number=20))

Moyenne: 980.23 µs Ecart-type 60.93 µs (with 20 runs) in [937.55 µs, 1.11 ms]

C’est beaucoup plus long que la régression linéaire. On essaye avec onnx.

if ok_onnx:
    onnxrf_model = convert_sklearn(
        rf, 'model', [('input', FloatTensorType([None, clr.coef_.shape[0]]))],
        target_opset=11)
    onnxrf_model.ir_version = 6
    save_model(onnxrf_model, 'model_rf.onnx')
    model_onnx = onnx.load('model_rf.onnx')

if ok_onnx:
    sess = onnxruntime.InferenceSession("model_rf.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt_rf(x):
        return sess.run(["variable"], {'input': x})

    print(predict_onnxrt_rf(x.astype(numpy.float32)))
    memo_time.append(timeexe("onnx-rf", "predict_onnxrt_rf(x.astype(numpy.float32))",
                             repeat=100, number=20))

Input: NodeArg(name='input', type='tensor(float)', shape=[None, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[None, 1])
[array([[243.00002]], dtype=float32)]
Moyenne: 14.36 µs Ecart-type 4.18 µs (with 20 runs) in [11.75 µs, 22.22 µs]

C’est beaucoup plus rapide.

import pandas
df2 = pandas.DataFrame(data=memo_time)
df2 = df2.set_index("legend").sort_values("average")
df2

	average	deviation	first	first3	last3	repeat	min5	max5	code	run
legend
onnx-rf	0.000014	0.000004	0.000047	0.000027	0.000014	100	0.000012	0.000022	predict_onnxrt_rf(x.astype(numpy.float32))	20
sklearn-rf	0.000980	0.000061	0.001308	0.001087	0.001075	100	0.000938	0.001106	rf.predict(x)	20

fig, ax = plt.subplots(1, 1, figsize=(14,4))
df2[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                   legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");

<ipython-input-89-8d560b1fbb3b>:5: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
  ax.grid(b=True, which="major")
<ipython-input-89-8d560b1fbb3b>:6: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
  ax.grid(b=True, which="minor");

../_images/cffi_linear_regression_133_1.png

Prédiction en batch #

memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
repeat = 10
for i in batch[:15]:
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]

    print("batch", "=", i)

    memo.append(timeexe("sklearn.predict %d" % i, "rf.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"

    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i,
                            "predict_onnxrt_rf(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"

batch = 1
Moyenne: 1.11 ms Ecart-type 145.19 µs (with 10 runs) in [1.03 ms, 1.54 ms]
Moyenne: 15.70 µs Ecart-type 13.36 µs (with 10 runs) in [11.20 µs, 55.77 µs]
batch = 10
Moyenne: 1.14 ms Ecart-type 162.36 µs (with 10 runs) in [952.57 µs, 1.51 ms]
Moyenne: 25.55 µs Ecart-type 9.43 µs (with 10 runs) in [17.37 µs, 42.15 µs]
batch = 100
Moyenne: 1.09 ms Ecart-type 80.51 µs (with 10 runs) in [1.01 ms, 1.31 ms]
Moyenne: 38.04 µs Ecart-type 17.20 µs (with 10 runs) in [32.02 µs, 89.62 µs]
batch = 200
Moyenne: 1.42 ms Ecart-type 126.30 µs (with 10 runs) in [1.15 ms, 1.71 ms]
Moyenne: 82.17 µs Ecart-type 56.27 µs (with 10 runs) in [43.86 µs, 213.17 µs]
batch = 500
Moyenne: 1.79 ms Ecart-type 543.34 µs (with 10 runs) in [1.31 ms, 3.18 ms]
Moyenne: 130.31 µs Ecart-type 30.45 µs (with 10 runs) in [85.15 µs, 190.08 µs]
batch = 1000
Moyenne: 1.53 ms Ecart-type 93.12 µs (with 10 runs) in [1.42 ms, 1.70 ms]
Moyenne: 249.60 µs Ecart-type 23.96 µs (with 10 runs) in [232.24 µs, 312.27 µs]
batch = 2000
Moyenne: 2.09 ms Ecart-type 149.23 µs (with 10 runs) in [1.89 ms, 2.33 ms]
Moyenne: 393.37 µs Ecart-type 165.01 µs (with 10 runs) in [283.40 µs, 734.87 µs]
batch = 3000
Moyenne: 2.77 ms Ecart-type 921.32 µs (with 10 runs) in [2.24 ms, 5.40 ms]
Moyenne: 432.57 µs Ecart-type 16.08 µs (with 10 runs) in [422.71 µs, 479.76 µs]
batch = 4000
Moyenne: 2.96 ms Ecart-type 331.99 µs (with 10 runs) in [2.63 ms, 3.69 ms]
Moyenne: 1.04 ms Ecart-type 485.53 µs (with 10 runs) in [598.92 µs, 2.38 ms]
batch = 5000
Moyenne: 3.27 ms Ecart-type 348.48 µs (with 10 runs) in [3.00 ms, 4.16 ms]
Moyenne: 996.95 µs Ecart-type 207.84 µs (with 10 runs) in [767.12 µs, 1.47 ms]
batch = 10000
Moyenne: 5.26 ms Ecart-type 404.81 µs (with 10 runs) in [4.96 ms, 6.34 ms]
Moyenne: 1.75 ms Ecart-type 317.18 µs (with 10 runs) in [1.34 ms, 2.13 ms]
batch = 20000
Moyenne: 10.52 ms Ecart-type 1.11 ms (with 10 runs) in [9.21 ms, 13.42 ms]
Moyenne: 4.40 ms Ecart-type 522.54 µs (with 10 runs) in [3.52 ms, 5.43 ms]
batch = 50000
Moyenne: 24.33 ms Ecart-type 2.90 ms (with 10 runs) in [21.27 ms, 29.83 ms]
Moyenne: 8.21 ms Ecart-type 1.31 ms (with 10 runs) in [7.32 ms, 11.74 ms]
batch = 75000
Moyenne: 31.54 ms Ecart-type 251.81 µs (with 10 runs) in [31.19 ms, 32.06 ms]
Moyenne: 12.22 ms Ecart-type 908.21 µs (with 10 runs) in [11.38 ms, 14.24 ms]
batch = 100000
Moyenne: 42.05 ms Ecart-type 745.44 µs (with 10 runs) in [41.22 ms, 43.35 ms]
Moyenne: 16.17 ms Ecart-type 1.01 ms (with 10 runs) in [14.98 ms, 17.54 ms]

dfbrf = pandas.DataFrame(memo)[["average", "lib", "batch"]]
pivrf = dfbrf.pivot("batch", "lib", "average")
for c in pivrf.columns:
    pivrf["ave_" + c] = pivrf[c] / pivrf.index
libs = list(c for c in pivrf.columns if "ave_" in c)
ax = pivrf.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch\nrandom forest")
ax.grid(True);

../_images/cffi_linear_regression_136_0.png

Liens

Contenu

Information

Sujet précédent

Sujet suivant