Optimisation de code avec cffi, numba, cython#
Links: notebook
, html, python
, slides, GitHub
L’idée est de recoder une fonction en C. On prend comme exemple la fonction de prédiction de la régression linéaire de scikit-learn et de prévoir le gain de temps qu’on obtient en recodant la fonction dans un langage plus rapide.
from jyquickhelper import add_notebook_menu
add_notebook_menu()
memo_time = []
import timeit
def unit(x):
if x >= 1: return "%1.2f s" % x
elif x >= 1e-3: return "%1.2f ms" % (x* 1000)
elif x >= 1e-6: return "%1.2f µs" % (x* 1000**2)
elif x >= 1e-9: return "%1.2f ns" % (x* 1000**3)
else:
return "%1.2g s" % x
def timeexe(legend, code, number=100, repeat=1000):
rep = timeit.repeat(code, number=number, repeat=repeat, globals=globals())
ave = sum(rep) / (number * repeat)
std = (sum((x/number - ave)**2 for x in rep) / repeat)**0.5
fir = rep[0]/number
fir3 = sum(rep[:3]) / (3 * number)
las3 = sum(rep[-3:]) / (3 * number)
rep.sort()
mini = rep[len(rep)//20] / number
maxi = rep[-len(rep)//20] / number
print("Moyenne: %s Ecart-type %s (with %d runs) in [%s, %s]" % (
unit(ave), unit(std), number, unit(mini), unit(maxi)))
return dict(legend=legend, average=ave, deviation=std, first=fir, first3=fir3,
last3=las3, repeat=repeat, min5=mini, max5=maxi, code=code, run=number)
Régression linéaire#
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
diabetes = load_diabetes()
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(diabetes.data, diabetes.target)
from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(diabetes_X_train, diabetes_y_train)
LinearRegression()
clr.coef_
array([ -35.81159278, -267.39308261, 503.56121841, 337.87944184,
-577.27255236, 373.62939477, -99.69779327, 78.39842094,
656.54309153, 80.3383998 ])
clr.intercept_
152.69613239933642
z = diabetes_X_test[0:1,:]
memo_time.append(timeexe("sklearn.predict", "clr.predict(z)"))
Moyenne: 45.50 µs Ecart-type 6.34 µs (with 100 runs) in [40.87 µs, 52.95 µs]
%timeit clr.predict(z)
45.2 µs ± 744 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
optimisation avec cffi#
On s’inspire de l’exemple Purely for performance (API level, out-of-line).
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("int linreg(int, double *, double *, double, double *);")
ffibuilder.set_source("_linear_regression",
r"""
static int linreg(int dimension, double * x, double *coef, double intercept, double * out)
{
for(; dimension > 0; --dimension, ++x, ++coef)
intercept += *x * *coef;
*out = intercept;
return 1;
}
""")
ffibuilder.compile(verbose=True)
generating ._linear_regression.c (already up-to-date) the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a' running build_ext building '_linear_regression' extension C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:Python395_x64include -IC:Python395_x64include -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFCinclude -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037include -IC:Program Files (x86)Windows KitsNETFXSDK4.8includeum -IC:Program Files (x86)Windows Kits10include10.0.19041.0ucrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0shared -IC:Program Files (x86)Windows Kits10include10.0.19041.0um -IC:Program Files (x86)Windows Kits10include10.0.19041.0winrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0cppwinrt /Tc_linear_regression.c /Fo.Release_linear_regression.obj C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:Python395_x64libs /LIBPATH:C:Python395_x64PCbuildamd64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFClibx64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037libx64 /LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.8libumx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0ucrtx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0umx64 /EXPORT:PyInit__linear_regression .Release_linear_regression.obj /OUT:._linear_regression.cp39-win_amd64.pyd /IMPLIB:.Release_linear_regression.cp39-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression.cp39-win_amd64.pyd'
La fonction compilée est accessible comme suit.
from _linear_regression import ffi, lib
lib.linreg
<function _linear_regression.Lib.linreg>
On s’inspire de l’exemple How to pass a Numpy array into a cffi function and how to get one back out?.
import numpy
out = numpy.zeros(1)
ptr_coef = clr.coef_.__array_interface__['data'][0]
cptr_coef = ffi.cast ( "double*" , ptr_coef )
x = diabetes_X_test[0:1,:]
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast ( "double*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "double*" , ptr_out )
n = len(clr.coef_)
lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)
1
out
array([214.72477745])
On vérifie qu’on obtient bien la même chose.
clr.predict(x)
array([214.72477745])
Et on mesure le temps d’exécution :
memo_time.append(timeexe("cffi-linreg", "lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)"))
Moyenne: 831.37 ns Ecart-type 708.08 ns (with 100 runs) in [416.00 ns, 1.52 µs]
C’est beaucoup plus rapide. Pour être totalement honnête, il faut mesurer les étapes qui consiste à extraire les pointeurs.
def predict_clr(x, clr):
out = numpy.zeros(1)
ptr_coef = clr.coef_.__array_interface__['data'][0]
cptr_coef = ffi.cast ( "double*" , ptr_coef )
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast ( "double*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "double*" , ptr_out )
lib.linreg(len(x), cptr_x, cptr_coef, clr.intercept_, cptr_out)
return out
predict_clr(x, clr)
array([154.32457426])
memo_time.append(timeexe("cffi-linreg-wrapped", "predict_clr(x, clr)"))
Moyenne: 7.52 µs Ecart-type 2.34 µs (with 100 runs) in [6.20 µs, 10.42 µs]
Cela reste plus rapide.
cffi - seconde version#
Comme on construit la fonction en dynamique (le code est connu lors de l’exécution), on peut facilement se passer de la boucle et écrire le code sans boucle et avec les coefficients.
res = " + ".join("{0}*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'-35.81159277952622*x[0] + -267.39308260812277*x[1] + 503.56121841083586*x[2] + 337.87944183803455*x[3] + -577.2725523621144*x[4] + 373.6293947654621*x[5] + -99.69779326605845*x[6] + 78.39842093764699*x[7] + 656.5430915289373*x[8] + 80.33839980437061*x[9]'
code = """
static int linreg_custom(double * x, double * out)
{{
out[0] = {0} + {1};
}}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom(double * x, double * out)
{
out[0] = 152.69613239933642 + -35.81159277952622*x[0] + -267.39308260812277*x[1] + 503.56121841083586*x[2] + 337.87944183803455*x[3] + -577.2725523621144*x[4] + 373.6293947654621*x[5] + -99.69779326605845*x[6] + 78.39842093764699*x[7] + 656.5430915289373*x[8] + 80.33839980437061*x[9];
}
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("int linreg_custom(double *, double *);")
ffibuilder.set_source("_linear_regression_custom", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom.c the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a' running build_ext building '_linear_regression_custom' extension C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:Python395_x64include -IC:Python395_x64include -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFCinclude -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037include -IC:Program Files (x86)Windows KitsNETFXSDK4.8includeum -IC:Program Files (x86)Windows Kits10include10.0.19041.0ucrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0shared -IC:Program Files (x86)Windows Kits10include10.0.19041.0um -IC:Program Files (x86)Windows Kits10include10.0.19041.0winrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0cppwinrt /Tc_linear_regression_custom.c /Fo.Release_linear_regression_custom.obj C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:Python395_x64libs /LIBPATH:C:Python395_x64PCbuildamd64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFClibx64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037libx64 /LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.8libumx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0ucrtx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0umx64 /EXPORT:PyInit__linear_regression_custom .Release_linear_regression_custom.obj /OUT:._linear_regression_custom.cp39-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom.cp39-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom.cp39-win_amd64.pyd'
from _linear_regression_custom.lib import linreg_custom
linreg_custom(cptr_x, cptr_out)
out
array([214.72477745])
memo_time.append(timeexe("cffi-linreg-custom", "linreg_custom(cptr_x, cptr_out)"))
Moyenne: 466.52 ns Ecart-type 851.96 ns (with 100 runs) in [315.00 ns, 715.00 ns]
On a gagné un facteur 2.
def predict_clr_custom(x):
out = numpy.zeros(1)
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast("double*", ptr_x)
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast("double*", ptr_out)
linreg_custom(cptr_x, cptr_out)
return out
predict_clr_custom(x)
array([214.72477745])
memo_time.append(timeexe("cffi-linreg-custom wrapped", "predict_clr_custom(x)"))
Moyenne: 5.27 µs Ecart-type 1.82 µs (with 100 runs) in [4.42 µs, 7.77 µs]
C’est un peu plus rapide.
et en float?#
L’ordinateur fait la distinction entre les double code sur 64 bit et les float codé sur 32 bits. La précision est meilleure dans le premier cas et les calculs sont plus rapides dans le second. Dans le cas du machine learning, on préfère la rapidité à une perte précision en précision qui est souvent compensée par l’optimisation inhérente à tout problème de machine learning. Ce qu’on perd sur une observation, on le retrouve sur une autre.
res = " + ".join("{0}f*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'-35.81159277952622f*x[0] + -267.39308260812277f*x[1] + 503.56121841083586f*x[2] + 337.87944183803455f*x[3] + -577.2725523621144f*x[4] + 373.6293947654621f*x[5] + -99.69779326605845f*x[6] + 78.39842093764699f*x[7] + 656.5430915289373f*x[8] + 80.33839980437061f*x[9]'
code = """
static int linreg_custom_float(float * x, float * out)
{{
out[0] = {0}f + {1};
}}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom_float(float * x, float * out)
{
out[0] = 152.69613239933642f + -35.81159277952622f*x[0] + -267.39308260812277f*x[1] + 503.56121841083586f*x[2] + 337.87944183803455f*x[3] + -577.2725523621144f*x[4] + 373.6293947654621f*x[5] + -99.69779326605845f*x[6] + 78.39842093764699f*x[7] + 656.5430915289373f*x[8] + 80.33839980437061f*x[9];
}
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("int linreg_custom_float(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom_float.c the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a' running build_ext building '_linear_regression_custom_float' extension C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:Python395_x64include -IC:Python395_x64include -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFCinclude -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037include -IC:Program Files (x86)Windows KitsNETFXSDK4.8includeum -IC:Program Files (x86)Windows Kits10include10.0.19041.0ucrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0shared -IC:Program Files (x86)Windows Kits10include10.0.19041.0um -IC:Program Files (x86)Windows Kits10include10.0.19041.0winrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0cppwinrt /Tc_linear_regression_custom_float.c /Fo.Release_linear_regression_custom_float.obj C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:Python395_x64libs /LIBPATH:C:Python395_x64PCbuildamd64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFClibx64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037libx64 /LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.8libumx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0ucrtx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0umx64 /EXPORT:PyInit__linear_regression_custom_float .Release_linear_regression_custom_float.obj /OUT:._linear_regression_custom_float.cp39-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom_float.cp39-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float.cp39-win_amd64.pyd'
from _linear_regression_custom_float.lib import linreg_custom_float
def predict_clr_custom_float(x):
out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )
linreg_custom_float(cptr_x, cptr_out)
return out
Avant d’appeler la fonction, on doit transformer le vecteur iniatial en float32.
x32 = x.astype(numpy.float32)
predict_clr_custom(x32)
array([1.27301276e+31])
memo_time.append(timeexe("cffi-linreg-custom-float wrapped", "predict_clr_custom(x32)"))
Moyenne: 5.12 µs Ecart-type 1.60 µs (with 100 runs) in [4.48 µs, 6.44 µs]
La différence n’est pas flagrante. Mesurons le code C uniquement même si la partie Python ne peut pas être complètement évitée.
out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )
memo_time.append(timeexe("cffi-linreg-custom-float32", "linreg_custom_float(cptr_x, cptr_out)"))
Moyenne: 389.19 ns Ecart-type 226.75 ns (with 100 runs) in [317.00 ns, 577.00 ns]
La différence n’est pas significative.
SIMD#
C’est un ensemble d’instructions processeur pour faire des opérations terme à terme sur 4 float32 aussi rapidement qu’une seule. Le processeur ne peut faire des opérations que les nombres sont copiés dans ses registres. Le programme passe alors son temps à copier des nombres depuis la mémoire vers les registres du processeur puis à faire la copie dans le chemin inverse pour le résultat. Les instructions SIMD font gagner du temps du niveau du calcul. Au lieu de faire 4 opérations de multiplication terme à terme, il n’en fait plus qu’une. Il suffit de savoir comment utiliser ces instructions. Avec Visual Studio, elles sont accessible via ces fonctions Memory and Initialization Using Streaming SIMD Extensions. Le code suivant n’est probablement pas optimal mais il n’est pas trop compliqué à suivre.
code = """
#include <xmmintrin.h>
static int linreg_custom_float_simd(float * x, float * out)
{
__m128 c1 = _mm_set_ps(0.3034995490664121f, -237.63931533353392f, 510.5306054362245f, 327.7369804093466f);
__m128 c2 = _mm_set_ps(-814.1317093725389f, 492.81458798373245f, 102.84845219168025f, 184.60648905984064f);
__m128 r1 = _mm_set_ss(152.76430691633442f);
r1 = _mm_add_ss(r1, _mm_mul_ps(c1, _mm_load_ps(x)));
r1 = _mm_add_ss(r1, _mm_mul_ps(c2, _mm_load_ps(x+4)));
float r[4];
_mm_store_ps(r, r1);
out[0] = r[0] + r[1] + r[2] + r[3] + 743.5196167505419f * x[8] + 76.095172216624f * x[9];
return 1;
}
"""
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("int linreg_custom_float_simd(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float_simd", code)
ffibuilder.compile(verbose=True)
generating ._linear_regression_custom_float_simd.c (already up-to-date) the current directory is 'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a' running build_ext building '_linear_regression_custom_float_simd' extension C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:Python395_x64include -IC:Python395_x64include -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFCinclude -IC:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037include -IC:Program Files (x86)Windows KitsNETFXSDK4.8includeum -IC:Program Files (x86)Windows Kits10include10.0.19041.0ucrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0shared -IC:Program Files (x86)Windows Kits10include10.0.19041.0um -IC:Program Files (x86)Windows Kits10include10.0.19041.0winrt -IC:Program Files (x86)Windows Kits10include10.0.19041.0cppwinrt /Tc_linear_regression_custom_float_simd.c /Fo.Release_linear_regression_custom_float_simd.obj C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037binHostX86x64link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:Python395_x64libs /LIBPATH:C:Python395_x64PCbuildamd64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037ATLMFClibx64 /LIBPATH:C:Program Files (x86)Microsoft Visual Studio2019CommunityVCToolsMSVC14.29.30037libx64 /LIBPATH:C:Program Files (x86)Windows KitsNETFXSDK4.8libumx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0ucrtx64 /LIBPATH:C:Program Files (x86)Windows Kits10lib10.0.19041.0umx64 /EXPORT:PyInit__linear_regression_custom_float_simd .Release_linear_regression_custom_float_simd.obj /OUT:._linear_regression_custom_float_simd.cp39-win_amd64.pyd /IMPLIB:.Release_linear_regression_custom_float_simd.cp39-win_amd64.lib
'C:\xavierdupre\__home_\GitHub\ensae_teaching_cs\_doc\notebooks\2a\_linear_regression_custom_float_simd.cp39-win_amd64.pyd'
from _linear_regression_custom_float_simd.lib import linreg_custom_float_simd
out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )
linreg_custom_float_simd(cptr_x, cptr_out)
out
array([172.00832], dtype=float32)
memo_time.append(timeexe("cffi-linreg-custom-float32-simd", "linreg_custom_float_simd(cptr_x, cptr_out)"))
Moyenne: 418.99 ns Ecart-type 387.18 ns (with 100 runs) in [299.00 ns, 631.00 ns]
C’est légèrement mieux, quelques références :
aligned_vs_unaligned_load.c : c’est du code mais facile à lire.
Les processeurs évoluent au fil du temps, 4 float, 8 float, SIMD2, FMA4 Intrinsics Added for Visual Studio 2010 SP1, AVX.
Réécriture purement Python#
On continue avec uniquement du Python sans numpy.
coef = clr.coef_
list(coef)
[-35.81159277952622,
-267.39308260812277,
503.56121841083586,
337.87944183803455,
-577.2725523621144,
373.6293947654621,
-99.69779326605845,
78.39842093764699,
656.5430915289373,
80.33839980437061]
code = str(clr.intercept_) + "+" + "+".join("x[{0}]*({1})".format(i, c) for i, c in enumerate(coef))
code
'152.69613239933642+x[0]*(-35.81159277952622)+x[1]*(-267.39308260812277)+x[2]*(503.56121841083586)+x[3]*(337.87944183803455)+x[4]*(-577.2725523621144)+x[5]*(373.6293947654621)+x[6]*(-99.69779326605845)+x[7]*(78.39842093764699)+x[8]*(656.5430915289373)+x[9]*(80.33839980437061)'
def predict_clr_python(x):
return 152.764306916+x[0]*0.3034995490664121+x[1]*(-237.63931533353392)+x[2]*510.5306054362245+ \
x[3]*327.7369804093466+ \
x[4]*(-814.1317093725389)+x[5]*492.81458798373245+x[6]*102.84845219168025+ \
x[7]*184.60648905984064+x[8]*743.5196167505419+x[9]*76.095172216624
predict_clr_python(x[0])
211.03463170273153
z = list(x[0])
memo_time.append(timeexe("python-linreg-custom", "predict_clr_python(z)"))
Moyenne: 2.02 µs Ecart-type 670.45 ns (with 100 runs) in [1.70 µs, 2.73 µs]
De façon assez surprenante, c’est plutôt rapide. Et si on y mettait une boucle.
def predict_clr_python_loop(x, coef, intercept):
return intercept + sum(a*b for a, b in zip(x, coef))
predict_clr_python_loop(x[0], list(clr.coef_), clr.intercept_)
214.72477744760596
coef = list(clr.coef_)
intercept = clr.intercept_
memo_time.append(timeexe("python-linreg", "predict_clr_python_loop(z, coef, intercept)"))
Moyenne: 3.54 µs Ecart-type 1.31 µs (with 100 runs) in [2.68 µs, 6.16 µs]
A peine plus long.
Réécriture avec Python et numpy#
def predict_clr_numpy(x, coef, intercept):
return intercept + numpy.dot(coef, x).sum()
predict_clr_numpy(x[0], clr.coef_, clr.intercept_)
214.72477744760596
memo_time.append(timeexe("numpy-linreg-numpy", "predict_clr_numpy(z, coef, clr.intercept_)"))
Moyenne: 8.08 µs Ecart-type 3.44 µs (with 100 runs) in [6.44 µs, 12.16 µs]
Les dimensions des tableaux sont trop petites pour que le calcul matriciel apporte une différence. On se retrouve dans le cas cffi où les échanges Python - C grignotent tout le temps de calcul.
numba#
numba essaye de compiler à la volée des
bouts de codes écrits en Python. On induque quelle fonction optimiser en
faisant précéder la fonction de @jit
. Toutes les écritures ne
fonctionnent, typiquement, certaines listes en compréhension soulèvent
une exception. Il faut donc écrire son code en Python d’une façon assez
proche de ce qu’il serait en C.
from numba import jit
@jit
def predict_clr_numba(x, coef, intercept):
s = intercept
for i in range(0, len(x)):
s += coef[i] * x[i]
return s
predict_clr_numba(z, clr.coef_, clr.intercept_)
C:Python395_x64libsite-packagesnumbacoreir_utils.py:2152: NumbaPendingDeprecationWarning: [1m Encountered the use of a type that is scheduled for deprecation: type 'reflected list' found for argument 'x' of function 'predict_clr_numba'. For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types [1m File "<ipython-input-50-3bf9efb9c9c6>", line 2:[0m [1m@jit [1mdef predict_clr_numba(x, coef, intercept): [0m[1m^[0m[0m [0m warnings.warn(NumbaPendingDeprecationWarning(msg, loc=loc))
214.724777447606
memo_time.append(timeexe("numba-linreg-notype", "predict_clr_numba(z, clr.coef_, clr.intercept_)"))
Moyenne: 23.77 µs Ecart-type 7.36 µs (with 100 runs) in [19.99 µs, 37.64 µs]
Plutôt rapide !
@jit('double(double[:], double[:], double)')
def predict_clr_numba_cast(x, coef, intercept):
s = intercept
for i in range(0, len(x)):
s += coef[i] * x[i]
return s
# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)
214.724777447606
memo_time.append(timeexe("numba-linreg-type", "predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 948.20 ns Ecart-type 411.47 ns (with 100 runs) in [759.00 ns, 1.68 µs]
On voit que plus on donne d’information au compilateur, plus il est capable d’optimiser.
@jit('float32(float32[:], float32[:], float32)')
def predict_clr_numba_cast_float(x, coef, intercept):
s = intercept
for i in range(0, len(x)):
s += coef[i] * x[i]
return s
# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
x32 = x[0].astype(numpy.float32)
c32 = clr.coef_.astype(numpy.float32)
i32 = numpy.float32(clr.intercept_)
predict_clr_numba_cast_float(x32, c32, i32)
214.7247772216797
memo_time.append(timeexe("numba-linreg-type-float32", "predict_clr_numba_cast_float(x32, c32, i32)"))
Moyenne: 707.08 ns Ecart-type 268.64 ns (with 100 runs) in [565.00 ns, 1.25 µs]
On essaye avec les coefficients dans la fonction.
@jit('double(double[:])')
def predict_clr_numba_cast_custom(x):
coef = [ 3.03499549e-01, -2.37639315e+02, 5.10530605e+02, 3.27736980e+02,
-8.14131709e+02, 4.92814588e+02, 1.02848452e+02, 1.84606489e+02,
7.43519617e+02, 7.60951722e+01]
s = 152.76430691633442
for i in range(0, len(x)):
s += coef[i] * x[i]
return s
predict_clr_numba_cast_custom(x[0])
211.034631692416
memo_time.append(timeexe("numba-linreg-type-custom", "predict_clr_numba_cast_custom(x[0])"))
Moyenne: 824.35 ns Ecart-type 371.36 ns (with 100 runs) in [652.00 ns, 1.56 µs]
On se rapproche des temps obtenus avec cffi sans wrapping, cela signifie que numba fait un bien meilleur travail à ce niveau que le wrapper rapidement créé.
@jit('double(double[:], double[:], double)')
def predict_clr_numba_numpy(x, coef, intercept):
return intercept + numpy.dot(coef, x).sum()
predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)
<ipython-input-58-7020de83c055>:1: NumbaWarning: [1m Compilation is falling back to object mode WITH looplifting enabled because Function "predict_clr_numba_numpy" failed type inference due to: [1m[1mUnknown attribute 'sum' of type float64 [1m File "<ipython-input-58-7020de83c055>", line 3:[0m [1mdef predict_clr_numba_numpy(x, coef, intercept): [1m return intercept + numpy.dot(coef, x).sum() [0m [1m^[0m[0m [0m [0m[1mDuring: typing of get attribute at <ipython-input-58-7020de83c055> (3)[0m [1m File "<ipython-input-58-7020de83c055>", line 3:[0m [1mdef predict_clr_numba_numpy(x, coef, intercept): [1m return intercept + numpy.dot(coef, x).sum() [0m [1m^[0m[0m [0m @jit('double(double[:], double[:], double)') C:Python395_x64libsite-packagesnumbacoreobject_mode_passes.py:151: NumbaWarning: [1mFunction "predict_clr_numba_numpy" was compiled in object mode without forceobj=True. [1m File "<ipython-input-58-7020de83c055>", line 2:[0m [1m@jit('double(double[:], double[:], double)') [1mdef predict_clr_numba_numpy(x, coef, intercept): [0m[1m^[0m[0m [0m warnings.warn(errors.NumbaWarning(warn_msg, C:Python395_x64libsite-packagesnumbacoreobject_mode_passes.py:161: NumbaDeprecationWarning: [1m Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour. For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit [1m File "<ipython-input-58-7020de83c055>", line 2:[0m [1m@jit('double(double[:], double[:], double)') [1mdef predict_clr_numba_numpy(x, coef, intercept): [0m[1m^[0m[0m [0m warnings.warn(errors.NumbaDeprecationWarning(msg,
214.72477744760596
memo_time.append(timeexe("numba-linreg-type-numpy", "predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 5.15 µs Ecart-type 1.78 µs (with 100 runs) in [4.37 µs, 6.00 µs]
numba est moins performant quand numpy est impliqué car le code de numpy n’est pas réécrit, il est appelé.
cython#
cython permet de créer des extensions C de plus grande envergure que numba. C’est l’option choisie par scikit-learn. Il vaut mieux connaître le C pour s’en servir et là encore, l’objectif est de réduire les échanges Python / C qui coûtent cher.
%load_ext cython
%%cython
def predict_clr_cython(x, coef, intercept):
s = intercept
for i in range(0, len(x)):
s += coef[i] * x[i]
return s
predict_clr_cython(x[0], clr.coef_, clr.intercept_)
214.724777447606
memo_time.append(timeexe("cython-linreg", "predict_clr_cython(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 2.71 µs Ecart-type 1.60 µs (with 100 runs) in [1.92 µs, 7.19 µs]
Cython fait moins bien que numba dans notre cas et l’optimisation proposée est assez proche du temps déjà obtenue avec le langage Python seul. Cela est dû au fait que la plupart des objets tels que du code associé aux listes ou aux dictionnaires ont été réécrits en C.
%%cython
cimport numpy as npc
def predict_clr_cython_type(npc.ndarray[double, ndim=1, mode='c'] x,
npc.ndarray[double, ndim=1, mode='c'] coef,
double intercept):
cdef double s = intercept
for i in range(0, x.shape[0]):
s += coef[i] * x[i]
return s
predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)
214.724777447606
memo_time.append(timeexe(
"cython-linreg-type", "predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 721.31 ns Ecart-type 399.10 ns (with 100 runs) in [533.00 ns, 1.44 µs]
Le temps est quasi identique avec un écart type moins grand de façon significative.
Une dernière option : ONNX#
ONNX est un format de sérialisation qui permet de décrire un modèle de modèle de machine learning ou de deep learning. Cela permet de dissocer le modèle de la librairie qui a servi à le produire (voir ML.net and ONNX).
try:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime
import onnx
ok_onnx = True
print("onnx, skl2onnx, onnxruntime sont disponibles.")
def save_model(onnx_model, filename):
with open(filename, "wb") as f:
f.write(onnx_model.SerializeToString())
except ImportError as e:
print("La suite requiert onnx, skl2onnx et onnxruntime.")
print(e)
ok_onnx = False
Error in sys.excepthook: Traceback (most recent call last): File "C:Python395_x64libsite-packagesIPythoncoreinteractiveshell.py", line 1934, in showtraceback stb = value._render_traceback_() AttributeError: 'RuntimeError' object has no attribute '_render_traceback_' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "C:Python395_x64libsite-packagesIPythoncoreinteractiveshell.py", line 1936, in showtraceback stb = self.InteractiveTB.structured_traceback(etype, File "C:Python395_x64libsite-packagesIPythoncoreultratb.py", line 1105, in structured_traceback return FormattedTB.structured_traceback( File "C:Python395_x64libsite-packagesIPythoncoreultratb.py", line 999, in structured_traceback return VerboseTB.structured_traceback( File "C:Python395_x64libsite-packagesIPythoncoreultratb.py", line 851, in structured_traceback assert etb is not None AssertionError Original exception was: RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe
onnx, skl2onnx, onnxruntime sont disponibles.
On convertit le modèle au format ONNX.
if ok_onnx:
onnx_model = convert_sklearn(
clr, 'model', [('input', FloatTensorType([None, clr.coef_.shape[0]]))],
target_opset=11)
onnx_model.ir_version = 6
save_model(onnx_model, 'model.onnx')
model_onnx = onnx.load('model.onnx')
print("Modèle sérialisé au format ONNX")
print(model_onnx)
else:
print("onnx, onnxmltools, onnxruntime sont disponibles.")
Modèle sérialisé au format ONNX
ir_version: 6
producer_name: "skl2onnx"
producer_version: "1.10.4"
domain: "ai.onnx"
model_version: 0
doc_string: ""
graph {
node {
input: "input"
output: "variable"
name: "LinearRegressor"
op_type: "LinearRegressor"
attribute {
name: "coefficients"
floats: -35.81159210205078
floats: -267.3930969238281
floats: 503.56121826171875
floats: 337.87945556640625
floats: -577.2725219726562
floats: 373.62939453125
floats: -99.69779205322266
floats: 78.39842224121094
floats: 656.5430908203125
floats: 80.3384017944336
type: FLOATS
}
attribute {
name: "intercepts"
floats: 152.69613647460938
type: FLOATS
}
domain: "ai.onnx.ml"
}
name: "model"
input {
name: "input"
type {
tensor_type {
elem_type: 1
shape {
dim {
}
dim {
dim_value: 10
}
}
}
}
}
output {
name: "variable"
type {
tensor_type {
elem_type: 1
shape {
dim {
}
dim {
dim_value: 1
}
}
}
}
}
}
opset_import {
domain: "ai.onnx.ml"
version: 1
}
opset_import {
domain: ""
version: 11
}
On calcule les prédictions. Le module {onnxruntime](https://docs.microsoft.com/en-us/python/api/overview/azure/onnx/intro?view=azure-onnx-py) optimise les calculs pour des modèles de deep learning. Cela explique pourquoi tous les calculs sont réalisés avec des réels représentés sur 4 octets numpy.float32.
if ok_onnx:
sess = onnxruntime.InferenceSession("model.onnx")
for i in sess.get_inputs():
print('Input:', i)
for o in sess.get_outputs():
print('Output:', o)
def predict_onnxrt(x):
return sess.run(["variable"], {'input': x})
print("Prediction:", predict_onnxrt(x.astype(numpy.float32)))
Input: NodeArg(name='input', type='tensor(float)', shape=[None, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[None, 1])
Prediction: [array([[214.72478]], dtype=float32)]
if ok_onnx:
x32 = x.astype(numpy.float32)
memo_time.append(timeexe("onnxruntime-float32", "predict_onnxrt(x32)"))
memo_time.append(timeexe("onnxruntime-float64", "predict_onnxrt(x.astype(numpy.float32))"))
Moyenne: 13.00 µs Ecart-type 7.69 µs (with 100 runs) in [9.71 µs, 23.64 µs]
Moyenne: 12.69 µs Ecart-type 1.93 µs (with 100 runs) in [11.29 µs, 16.23 µs]
Récapitulatif#
import pandas
df = pandas.DataFrame(data=memo_time)
df = df.set_index("legend").sort_values("average")
df
average | deviation | first | first3 | last3 | repeat | min5 | max5 | code | run | |
---|---|---|---|---|---|---|---|---|---|---|
legend | ||||||||||
cffi-linreg-custom-float32 | 3.891910e-07 | 2.267541e-07 | 5.608000e-06 | 2.196000e-06 | 3.763333e-07 | 1000 | 3.170000e-07 | 5.770000e-07 | linreg_custom_float(cptr_x, cptr_out) | 100 |
cffi-linreg-custom-float32-simd | 4.189890e-07 | 3.871792e-07 | 1.091200e-05 | 3.949667e-06 | 2.996667e-07 | 1000 | 2.990000e-07 | 6.310000e-07 | linreg_custom_float_simd(cptr_x, cptr_out) | 100 |
cffi-linreg-custom | 4.665150e-07 | 8.519581e-07 | 2.679900e-05 | 9.352667e-06 | 3.256667e-07 | 1000 | 3.150000e-07 | 7.150000e-07 | linreg_custom(cptr_x, cptr_out) | 100 |
numba-linreg-type-float32 | 7.070790e-07 | 2.686359e-07 | 1.162000e-06 | 1.083333e-06 | 5.663333e-07 | 1000 | 5.650000e-07 | 1.249000e-06 | predict_clr_numba_cast_float(x32, c32, i32) | 100 |
cython-linreg-type | 7.213150e-07 | 3.991047e-07 | 1.252000e-06 | 8.300000e-07 | 5.513333e-07 | 1000 | 5.330000e-07 | 1.443000e-06 | predict_clr_cython_type(x[0], clr.coef_, clr.i... | 100 |
numba-linreg-type-custom | 8.243540e-07 | 3.713608e-07 | 7.940000e-07 | 7.156667e-07 | 6.543333e-07 | 1000 | 6.520000e-07 | 1.558000e-06 | predict_clr_numba_cast_custom(x[0]) | 100 |
cffi-linreg | 8.313670e-07 | 7.080831e-07 | 6.414000e-06 | 3.244000e-06 | 4.170000e-07 | 1000 | 4.160000e-07 | 1.519000e-06 | lib.linreg(n, cptr_x, cptr_coef, clr.intercept... | 100 |
numba-linreg-type | 9.482040e-07 | 4.114651e-07 | 9.350000e-07 | 8.663333e-07 | 7.596667e-07 | 1000 | 7.590000e-07 | 1.678000e-06 | predict_clr_numba_cast(x[0], clr.coef_, clr.in... | 100 |
python-linreg-custom | 2.018942e-06 | 6.704544e-07 | 5.511000e-06 | 4.254667e-06 | 1.703667e-06 | 1000 | 1.696000e-06 | 2.731000e-06 | predict_clr_python(z) | 100 |
cython-linreg | 2.706254e-06 | 1.597806e-06 | 5.083000e-06 | 5.419333e-06 | 2.126000e-06 | 1000 | 1.920000e-06 | 7.194000e-06 | predict_clr_cython(x[0], clr.coef_, clr.interc... | 100 |
python-linreg | 3.539523e-06 | 1.306156e-06 | 8.761000e-06 | 7.510000e-06 | 2.779667e-06 | 1000 | 2.681000e-06 | 6.164000e-06 | predict_clr_python_loop(z, coef, intercept) | 100 |
cffi-linreg-custom-float wrapped | 5.123886e-06 | 1.598363e-06 | 1.200400e-05 | 1.176767e-05 | 4.483000e-06 | 1000 | 4.477000e-06 | 6.436000e-06 | predict_clr_custom(x32) | 100 |
numba-linreg-type-numpy | 5.147404e-06 | 1.775723e-06 | 1.874100e-05 | 1.572433e-05 | 4.474333e-06 | 1000 | 4.374000e-06 | 5.996000e-06 | predict_clr_numba_numpy(x[0], clr.coef_, clr.i... | 100 |
cffi-linreg-custom wrapped | 5.274568e-06 | 1.823247e-06 | 2.166200e-05 | 2.268700e-05 | 5.626667e-06 | 1000 | 4.422000e-06 | 7.773000e-06 | predict_clr_custom(x) | 100 |
cffi-linreg-wrapped | 7.519599e-06 | 2.343424e-06 | 1.580000e-05 | 2.028933e-05 | 6.263333e-06 | 1000 | 6.201000e-06 | 1.041900e-05 | predict_clr(x, clr) | 100 |
numpy-linreg-numpy | 8.081947e-06 | 3.442724e-06 | 3.679000e-05 | 3.075167e-05 | 6.525667e-06 | 1000 | 6.442000e-06 | 1.216200e-05 | predict_clr_numpy(z, coef, clr.intercept_) | 100 |
onnxruntime-float64 | 1.269215e-05 | 1.926911e-06 | 1.742200e-05 | 1.337233e-05 | 1.133667e-05 | 1000 | 1.129500e-05 | 1.623200e-05 | predict_onnxrt(x.astype(numpy.float32)) | 100 |
onnxruntime-float32 | 1.299773e-05 | 7.686900e-06 | 2.281400e-05 | 1.689933e-05 | 1.009533e-05 | 1000 | 9.713000e-06 | 2.363700e-05 | predict_onnxrt(x32) | 100 |
numba-linreg-notype | 2.376539e-05 | 7.362380e-06 | 3.079800e-05 | 2.445400e-05 | 3.723367e-05 | 1000 | 1.998900e-05 | 3.763900e-05 | predict_clr_numba(z, clr.coef_, clr.intercept_) | 100 |
sklearn.predict | 4.550096e-05 | 6.337585e-06 | 7.724200e-05 | 6.447133e-05 | 4.143867e-05 | 1000 | 4.087300e-05 | 5.295400e-05 | clr.predict(z) | 100 |
On enlève quelques colonnes et on rappelle :
cffi: signifie optimisé avec cffi
custom: pas de boucle mais la fonction ne peut prédire qu’une seule régression linéaire
float32: utilise des float et non des double
linreg: régression linéaire
numba: optimisation avec numba
numpy: optimisation avec numpy
python: pas de C, que du python
simd: optimisé avec les instructions SIMD
sklearn: fonction sklearn.predict
static: la fonction utilise des variables statiques
type: la fonction est typée et ne fonctionne qu’avec un type précis en entrée.
wrapped: code optimisé mais embabllé dans une fonction Python qui elle ne l’est pas (les containers sont recréés à chaque fois)
cols = ["average", "deviation", "min5", "max5", "run", "code"]
df[cols]
average | deviation | min5 | max5 | run | code | |
---|---|---|---|---|---|---|
legend | ||||||
cffi-linreg-custom-float32 | 3.891910e-07 | 2.267541e-07 | 3.170000e-07 | 5.770000e-07 | 100 | linreg_custom_float(cptr_x, cptr_out) |
cffi-linreg-custom-float32-simd | 4.189890e-07 | 3.871792e-07 | 2.990000e-07 | 6.310000e-07 | 100 | linreg_custom_float_simd(cptr_x, cptr_out) |
cffi-linreg-custom | 4.665150e-07 | 8.519581e-07 | 3.150000e-07 | 7.150000e-07 | 100 | linreg_custom(cptr_x, cptr_out) |
numba-linreg-type-float32 | 7.070790e-07 | 2.686359e-07 | 5.650000e-07 | 1.249000e-06 | 100 | predict_clr_numba_cast_float(x32, c32, i32) |
cython-linreg-type | 7.213150e-07 | 3.991047e-07 | 5.330000e-07 | 1.443000e-06 | 100 | predict_clr_cython_type(x[0], clr.coef_, clr.i... |
numba-linreg-type-custom | 8.243540e-07 | 3.713608e-07 | 6.520000e-07 | 1.558000e-06 | 100 | predict_clr_numba_cast_custom(x[0]) |
cffi-linreg | 8.313670e-07 | 7.080831e-07 | 4.160000e-07 | 1.519000e-06 | 100 | lib.linreg(n, cptr_x, cptr_coef, clr.intercept... |
numba-linreg-type | 9.482040e-07 | 4.114651e-07 | 7.590000e-07 | 1.678000e-06 | 100 | predict_clr_numba_cast(x[0], clr.coef_, clr.in... |
python-linreg-custom | 2.018942e-06 | 6.704544e-07 | 1.696000e-06 | 2.731000e-06 | 100 | predict_clr_python(z) |
cython-linreg | 2.706254e-06 | 1.597806e-06 | 1.920000e-06 | 7.194000e-06 | 100 | predict_clr_cython(x[0], clr.coef_, clr.interc... |
python-linreg | 3.539523e-06 | 1.306156e-06 | 2.681000e-06 | 6.164000e-06 | 100 | predict_clr_python_loop(z, coef, intercept) |
cffi-linreg-custom-float wrapped | 5.123886e-06 | 1.598363e-06 | 4.477000e-06 | 6.436000e-06 | 100 | predict_clr_custom(x32) |
numba-linreg-type-numpy | 5.147404e-06 | 1.775723e-06 | 4.374000e-06 | 5.996000e-06 | 100 | predict_clr_numba_numpy(x[0], clr.coef_, clr.i... |
cffi-linreg-custom wrapped | 5.274568e-06 | 1.823247e-06 | 4.422000e-06 | 7.773000e-06 | 100 | predict_clr_custom(x) |
cffi-linreg-wrapped | 7.519599e-06 | 2.343424e-06 | 6.201000e-06 | 1.041900e-05 | 100 | predict_clr(x, clr) |
numpy-linreg-numpy | 8.081947e-06 | 3.442724e-06 | 6.442000e-06 | 1.216200e-05 | 100 | predict_clr_numpy(z, coef, clr.intercept_) |
onnxruntime-float64 | 1.269215e-05 | 1.926911e-06 | 1.129500e-05 | 1.623200e-05 | 100 | predict_onnxrt(x.astype(numpy.float32)) |
onnxruntime-float32 | 1.299773e-05 | 7.686900e-06 | 9.713000e-06 | 2.363700e-05 | 100 | predict_onnxrt(x32) |
numba-linreg-notype | 2.376539e-05 | 7.362380e-06 | 1.998900e-05 | 3.763900e-05 | 100 | predict_clr_numba(z, clr.coef_, clr.intercept_) |
sklearn.predict | 4.550096e-05 | 6.337585e-06 | 4.087300e-05 | 5.295400e-05 | 100 | clr.predict(z) |
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(14,6))
df[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
<ipython-input-73-4cda8b1055aa>:7: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
ax.grid(b=True, which="major")
<ipython-input-73-4cda8b1055aa>:8: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
ax.grid(b=True, which="minor");
Il manque à ce comparatif le GPU mais c’est un peu plus complexe à mettre en oeuvre, il faut une carte GPU et la parallélisation n’apporterait pas énormément compte tenu de la faible dimension du problème.
Prédiction one-off et biais de mesure#
Le graphique précédent montre que la fonction predict
de
scikit-learn est la plus lente. La première raison est que ce code est
valable pour toutes les régresssions linéaires alors que toutes les
autres fonctions sont spécialisées pour un seul modèle. La seconde
raison est que le code de scikit-learn est optimisé pour le calcul de
plusieurs prédictions à la fois alors que toutes les autres fonctions
n’en calcule qu’une seule (scénario dit one-off). On compare à ce que
donnerait unev version purement python et numpy.
def predict_clr_python_loop_multi(x, coef, intercept):
# On s'attend à deux dimension.
res = numpy.zeros((x.shape[0], 1))
res[:, 0] = intercept
for i in range(0, x.shape[0]):
res[i, 0] += sum(a*b for a, b in zip(x[i, :], coef))
return res
predict_clr_python_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[214.72477745],
[175.29091463]])
def predict_clr_numpy_loop_multi(x, coef, intercept):
# On s'attend à deux dimension.
res = numpy.ones((x.shape[0], 1)) * intercept
res += x @ coef.reshape((len(coef), 1))
return res
predict_clr_numpy_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[214.72477745],
[175.29091463]])
def predict_clr_numba_cast_multi(X, coef, intercept):
return [predict_clr_numba_cast(x, coef, intercept) for x in X]
predict_clr_numba_cast_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[214.724777447606, 175.29091463098356]
def predict_clr_cython_type_multi(X, coef, intercept):
return [predict_clr_cython_type(x, coef, intercept) for x in X]
predict_clr_cython_type_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[214.724777447606, 175.29091463098356]
memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
500000, 600000]
number = 10
for i in batch:
if i <= diabetes_X_test.shape[0]:
mx = diabetes_X_test[:i]
else:
mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
mx = numpy.vstack(mxs)
mx = mx[:i]
print("batch", "=", i)
repeat=20 if i >= 5000 else 100
memo.append(timeexe("sklearn.predict %d" % i, "clr.predict(mx)",
repeat=repeat, number=number))
memo[-1]["batch"] = i
memo[-1]["lib"] = "sklearn"
if i <= 1000:
# très lent
memo.append(timeexe("python %d" % i, "predict_clr_python_loop_multi(mx, clr.coef_, clr.intercept_)",
repeat=20, number=number))
memo[-1]["batch"] = i
memo[-1]["lib"] = "python"
memo.append(timeexe("numpy %d" % i, "predict_clr_numpy_loop_multi(mx, clr.coef_, clr.intercept_)",
repeat=repeat, number=number))
memo[-1]["batch"] = i
memo[-1]["lib"] = "numpy"
if i <= 10000:
# très lent
memo.append(timeexe("numba %d" % i, "predict_clr_numba_cast_multi(mx, clr.coef_, clr.intercept_)",
repeat=repeat, number=number))
memo[-1]["batch"] = i
memo[-1]["lib"] = "numba"
if i <= 1000:
# très lent
memo.append(timeexe("cython %d" % i, "predict_clr_cython_type_multi(mx, clr.coef_, clr.intercept_)",
repeat=repeat, number=number))
memo[-1]["batch"] = i
memo[-1]["lib"] = "cython"
if ok_onnx:
memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt(mx.astype(numpy.float32))",
repeat=repeat, number=number))
memo[-1]["batch"] = i
memo[-1]["lib"] = "onnxruntime"
batch = 1
Moyenne: 62.45 µs Ecart-type 33.39 µs (with 10 runs) in [41.71 µs, 119.15 µs]
Moyenne: 6.43 µs Ecart-type 496.70 ns (with 10 runs) in [6.26 µs, 8.59 µs]
Moyenne: 12.10 µs Ecart-type 8.26 µs (with 10 runs) in [8.20 µs, 26.70 µs]
Moyenne: 1.99 µs Ecart-type 259.55 ns (with 10 runs) in [1.95 µs, 1.99 µs]
Moyenne: 1.61 µs Ecart-type 103.91 ns (with 10 runs) in [1.59 µs, 1.63 µs]
Moyenne: 19.38 µs Ecart-type 11.88 µs (with 10 runs) in [13.14 µs, 43.26 µs]
batch = 10
Moyenne: 73.96 µs Ecart-type 41.93 µs (with 10 runs) in [43.50 µs, 116.26 µs]
Moyenne: 101.06 µs Ecart-type 3.73 µs (with 10 runs) in [98.16 µs, 111.41 µs]
Moyenne: 19.06 µs Ecart-type 31.16 µs (with 10 runs) in [11.86 µs, 25.57 µs]
Moyenne: 10.84 µs Ecart-type 5.26 µs (with 10 runs) in [8.34 µs, 22.70 µs]
Moyenne: 10.46 µs Ecart-type 5.49 µs (with 10 runs) in [5.69 µs, 20.84 µs]
Moyenne: 19.66 µs Ecart-type 25.05 µs (with 10 runs) in [12.23 µs, 34.34 µs]
batch = 100
Moyenne: 68.65 µs Ecart-type 26.04 µs (with 10 runs) in [46.99 µs, 119.00 µs]
Moyenne: 740.30 µs Ecart-type 156.38 µs (with 10 runs) in [512.14 µs, 1.02 ms]
Moyenne: 10.75 µs Ecart-type 3.89 µs (with 10 runs) in [8.72 µs, 16.79 µs]
Moyenne: 94.32 µs Ecart-type 14.42 µs (with 10 runs) in [72.11 µs, 124.00 µs]
Moyenne: 67.23 µs Ecart-type 31.05 µs (with 10 runs) in [43.72 µs, 135.94 µs]
Moyenne: 91.28 µs Ecart-type 164.49 µs (with 10 runs) in [15.53 µs, 481.48 µs]
batch = 200
Moyenne: 68.82 µs Ecart-type 38.95 µs (with 10 runs) in [46.58 µs, 152.87 µs]
Moyenne: 1.59 ms Ecart-type 497.08 µs (with 10 runs) in [1.09 ms, 2.98 ms]
Moyenne: 11.66 µs Ecart-type 2.01 µs (with 10 runs) in [9.79 µs, 16.71 µs]
Moyenne: 167.67 µs Ecart-type 37.37 µs (with 10 runs) in [133.64 µs, 240.53 µs]
Moyenne: 102.09 µs Ecart-type 25.18 µs (with 10 runs) in [86.07 µs, 162.09 µs]
Moyenne: 18.04 µs Ecart-type 8.31 µs (with 10 runs) in [15.00 µs, 34.57 µs]
batch = 500
Moyenne: 63.53 µs Ecart-type 20.92 µs (with 10 runs) in [50.94 µs, 116.69 µs]
Moyenne: 3.22 ms Ecart-type 296.30 µs (with 10 runs) in [2.84 ms, 3.80 ms]
Moyenne: 13.91 µs Ecart-type 4.58 µs (with 10 runs) in [11.80 µs, 26.71 µs]
Moyenne: 410.88 µs Ecart-type 73.68 µs (with 10 runs) in [333.06 µs, 523.19 µs]
Moyenne: 263.08 µs Ecart-type 117.22 µs (with 10 runs) in [211.75 µs, 444.83 µs]
Moyenne: 22.28 µs Ecart-type 12.93 µs (with 10 runs) in [19.16 µs, 37.56 µs]
batch = 1000
Moyenne: 153.47 µs Ecart-type 43.85 µs (with 10 runs) in [125.94 µs, 229.51 µs]
Moyenne: 5.52 ms Ecart-type 389.98 µs (with 10 runs) in [4.99 ms, 6.18 ms]
Moyenne: 83.03 µs Ecart-type 22.52 µs (with 10 runs) in [73.67 µs, 95.91 µs]
Moyenne: 702.77 µs Ecart-type 76.26 µs (with 10 runs) in [661.35 µs, 888.11 µs]
Moyenne: 445.87 µs Ecart-type 53.38 µs (with 10 runs) in [420.78 µs, 548.59 µs]
Moyenne: 27.48 µs Ecart-type 6.31 µs (with 10 runs) in [26.53 µs, 29.87 µs]
batch = 2000
Moyenne: 147.73 µs Ecart-type 19.47 µs (with 10 runs) in [132.10 µs, 187.47 µs]
Moyenne: 83.71 µs Ecart-type 4.68 µs (with 10 runs) in [79.25 µs, 93.14 µs]
Moyenne: 1.58 ms Ecart-type 216.25 µs (with 10 runs) in [1.32 ms, 1.97 ms]
Moyenne: 47.31 µs Ecart-type 20.65 µs (with 10 runs) in [37.00 µs, 97.03 µs]
batch = 3000
Moyenne: 179.79 µs Ecart-type 45.35 µs (with 10 runs) in [144.02 µs, 310.74 µs]
Moyenne: 92.27 µs Ecart-type 7.05 µs (with 10 runs) in [84.71 µs, 106.64 µs]
Moyenne: 2.37 ms Ecart-type 267.39 µs (with 10 runs) in [1.99 ms, 2.91 ms]
Moyenne: 50.69 µs Ecart-type 6.32 µs (with 10 runs) in [48.65 µs, 52.22 µs]
batch = 4000
Moyenne: 193.02 µs Ecart-type 28.74 µs (with 10 runs) in [173.71 µs, 211.52 µs]
Moyenne: 100.06 µs Ecart-type 22.27 µs (with 10 runs) in [85.61 µs, 133.38 µs]
Moyenne: 3.13 ms Ecart-type 296.38 µs (with 10 runs) in [2.73 ms, 3.54 ms]
Moyenne: 64.67 µs Ecart-type 7.43 µs (with 10 runs) in [59.90 µs, 68.08 µs]
batch = 5000
Moyenne: 215.06 µs Ecart-type 46.52 µs (with 10 runs) in [196.06 µs, 411.19 µs]
Moyenne: 110.91 µs Ecart-type 8.08 µs (with 10 runs) in [90.36 µs, 122.94 µs]
Moyenne: 3.49 ms Ecart-type 212.13 µs (with 10 runs) in [3.30 ms, 4.04 ms]
Moyenne: 78.86 µs Ecart-type 5.47 µs (with 10 runs) in [77.15 µs, 102.21 µs]
batch = 10000
Moyenne: 248.75 µs Ecart-type 64.14 µs (with 10 runs) in [192.57 µs, 425.01 µs]
Moyenne: 116.55 µs Ecart-type 17.05 µs (with 10 runs) in [100.13 µs, 152.60 µs]
Moyenne: 7.18 ms Ecart-type 420.77 µs (with 10 runs) in [6.62 ms, 8.15 ms]
Moyenne: 153.30 µs Ecart-type 13.69 µs (with 10 runs) in [149.03 µs, 211.69 µs]
batch = 20000
Moyenne: 293.81 µs Ecart-type 19.49 µs (with 10 runs) in [283.46 µs, 364.31 µs]
Moyenne: 147.12 µs Ecart-type 8.23 µs (with 10 runs) in [135.43 µs, 160.67 µs]
Moyenne: 215.69 µs Ecart-type 14.46 µs (with 10 runs) in [204.68 µs, 262.99 µs]
batch = 50000
Moyenne: 1.00 ms Ecart-type 44.28 µs (with 10 runs) in [967.01 µs, 1.13 ms]
Moyenne: 503.33 µs Ecart-type 13.21 µs (with 10 runs) in [491.66 µs, 551.81 µs]
Moyenne: 1.86 ms Ecart-type 1.14 ms (with 10 runs) in [1.13 ms, 4.90 ms]
batch = 75000
Moyenne: 1.75 ms Ecart-type 153.53 µs (with 10 runs) in [1.56 ms, 1.94 ms]
Moyenne: 663.38 µs Ecart-type 20.47 µs (with 10 runs) in [630.15 µs, 700.62 µs]
Moyenne: 1.88 ms Ecart-type 173.99 µs (with 10 runs) in [1.65 ms, 2.14 ms]
batch = 100000
Moyenne: 2.56 ms Ecart-type 204.42 µs (with 10 runs) in [2.27 ms, 2.85 ms]
Moyenne: 1.21 ms Ecart-type 113.75 µs (with 10 runs) in [1.04 ms, 1.44 ms]
Moyenne: 2.98 ms Ecart-type 934.23 µs (with 10 runs) in [2.22 ms, 6.31 ms]
batch = 150000
Moyenne: 4.00 ms Ecart-type 188.08 µs (with 10 runs) in [3.78 ms, 4.46 ms]
Moyenne: 2.92 ms Ecart-type 344.26 µs (with 10 runs) in [2.54 ms, 3.93 ms]
Moyenne: 3.76 ms Ecart-type 308.56 µs (with 10 runs) in [3.26 ms, 4.52 ms]
batch = 200000
Moyenne: 5.73 ms Ecart-type 424.36 µs (with 10 runs) in [5.17 ms, 6.72 ms]
Moyenne: 4.00 ms Ecart-type 606.67 µs (with 10 runs) in [3.50 ms, 6.04 ms]
Moyenne: 5.44 ms Ecart-type 742.52 µs (with 10 runs) in [4.57 ms, 7.38 ms]
batch = 300000
Moyenne: 8.36 ms Ecart-type 1.26 ms (with 10 runs) in [7.78 ms, 13.52 ms]
Moyenne: 5.37 ms Ecart-type 352.34 µs (with 10 runs) in [5.08 ms, 6.64 ms]
Moyenne: 7.18 ms Ecart-type 680.24 µs (with 10 runs) in [6.69 ms, 8.83 ms]
batch = 400000
Moyenne: 11.49 ms Ecart-type 1.16 ms (with 10 runs) in [10.36 ms, 15.15 ms]
Moyenne: 7.87 ms Ecart-type 709.04 µs (with 10 runs) in [7.18 ms, 9.70 ms]
Moyenne: 10.51 ms Ecart-type 900.27 µs (with 10 runs) in [9.41 ms, 13.22 ms]
batch = 500000
Moyenne: 15.01 ms Ecart-type 1.90 ms (with 10 runs) in [12.99 ms, 20.81 ms]
Moyenne: 11.02 ms Ecart-type 889.69 µs (with 10 runs) in [9.64 ms, 13.29 ms]
Moyenne: 17.02 ms Ecart-type 2.13 ms (with 10 runs) in [14.72 ms, 22.19 ms]
batch = 600000
Moyenne: 21.19 ms Ecart-type 1.93 ms (with 10 runs) in [18.32 ms, 26.29 ms]
Moyenne: 12.47 ms Ecart-type 964.03 µs (with 10 runs) in [11.00 ms, 14.31 ms]
Moyenne: 18.04 ms Ecart-type 2.80 ms (with 10 runs) in [13.37 ms, 24.63 ms]
dfb = pandas.DataFrame(memo)[["average", "lib", "batch"]]
piv = dfb.pivot("batch", "lib", "average")
piv
lib | cython | numba | numpy | onnxruntime | python | sklearn |
---|---|---|---|---|---|---|
batch | ||||||
1 | 0.000002 | 0.000002 | 0.000012 | 0.000019 | 0.000006 | 0.000062 |
10 | 0.000010 | 0.000011 | 0.000019 | 0.000020 | 0.000101 | 0.000074 |
100 | 0.000067 | 0.000094 | 0.000011 | 0.000091 | 0.000740 | 0.000069 |
200 | 0.000102 | 0.000168 | 0.000012 | 0.000018 | 0.001590 | 0.000069 |
500 | 0.000263 | 0.000411 | 0.000014 | 0.000022 | 0.003225 | 0.000064 |
1000 | 0.000446 | 0.000703 | 0.000083 | 0.000027 | 0.005516 | 0.000153 |
2000 | NaN | 0.001580 | 0.000084 | 0.000047 | NaN | 0.000148 |
3000 | NaN | 0.002371 | 0.000092 | 0.000051 | NaN | 0.000180 |
4000 | NaN | 0.003125 | 0.000100 | 0.000065 | NaN | 0.000193 |
5000 | NaN | 0.003490 | 0.000111 | 0.000079 | NaN | 0.000215 |
10000 | NaN | 0.007181 | 0.000117 | 0.000153 | NaN | 0.000249 |
20000 | NaN | NaN | 0.000147 | 0.000216 | NaN | 0.000294 |
50000 | NaN | NaN | 0.000503 | 0.001863 | NaN | 0.001000 |
75000 | NaN | NaN | 0.000663 | 0.001879 | NaN | 0.001749 |
100000 | NaN | NaN | 0.001209 | 0.002980 | NaN | 0.002557 |
150000 | NaN | NaN | 0.002923 | 0.003762 | NaN | 0.004001 |
200000 | NaN | NaN | 0.004001 | 0.005440 | NaN | 0.005731 |
300000 | NaN | NaN | 0.005366 | 0.007180 | NaN | 0.008365 |
400000 | NaN | NaN | 0.007872 | 0.010510 | NaN | 0.011489 |
500000 | NaN | NaN | 0.011016 | 0.017021 | NaN | 0.015013 |
600000 | NaN | NaN | 0.012468 | 0.018040 | NaN | 0.021193 |
for c in piv.columns:
piv["ave_" + c] = piv[c] / piv.index
piv
lib | cython | numba | numpy | onnxruntime | python | sklearn | ave_cython | ave_numba | ave_numpy | ave_onnxruntime | ave_python | ave_sklearn |
---|---|---|---|---|---|---|---|---|---|---|---|---|
batch | ||||||||||||
1 | 0.000002 | 0.000002 | 0.000012 | 0.000019 | 0.000006 | 0.000062 | 1.614900e-06 | 1.990900e-06 | 1.209790e-05 | 1.938460e-05 | 0.000006 | 6.245340e-05 |
10 | 0.000010 | 0.000011 | 0.000019 | 0.000020 | 0.000101 | 0.000074 | 1.046460e-06 | 1.084320e-06 | 1.906010e-06 | 1.965810e-06 | 0.000010 | 7.396440e-06 |
100 | 0.000067 | 0.000094 | 0.000011 | 0.000091 | 0.000740 | 0.000069 | 6.722760e-07 | 9.431990e-07 | 1.075410e-07 | 9.127790e-07 | 0.000007 | 6.865190e-07 |
200 | 0.000102 | 0.000168 | 0.000012 | 0.000018 | 0.001590 | 0.000069 | 5.104525e-07 | 8.383455e-07 | 5.827850e-08 | 9.019900e-08 | 0.000008 | 3.440995e-07 |
500 | 0.000263 | 0.000411 | 0.000014 | 0.000022 | 0.003225 | 0.000064 | 5.261610e-07 | 8.217592e-07 | 2.781740e-08 | 4.455220e-08 | 0.000006 | 1.270610e-07 |
1000 | 0.000446 | 0.000703 | 0.000083 | 0.000027 | 0.005516 | 0.000153 | 4.458687e-07 | 7.027674e-07 | 8.303090e-08 | 2.747640e-08 | 0.000006 | 1.534708e-07 |
2000 | NaN | 0.001580 | 0.000084 | 0.000047 | NaN | 0.000148 | NaN | 7.899395e-07 | 4.185515e-08 | 2.365645e-08 | NaN | 7.386540e-08 |
3000 | NaN | 0.002371 | 0.000092 | 0.000051 | NaN | 0.000180 | NaN | 7.902492e-07 | 3.075760e-08 | 1.689707e-08 | NaN | 5.992867e-08 |
4000 | NaN | 0.003125 | 0.000100 | 0.000065 | NaN | 0.000193 | NaN | 7.813673e-07 | 2.501480e-08 | 1.616818e-08 | NaN | 4.825388e-08 |
5000 | NaN | 0.003490 | 0.000111 | 0.000079 | NaN | 0.000215 | NaN | 6.979748e-07 | 2.218220e-08 | 1.577170e-08 | NaN | 4.301210e-08 |
10000 | NaN | 0.007181 | 0.000117 | 0.000153 | NaN | 0.000249 | NaN | 7.180820e-07 | 1.165535e-08 | 1.533050e-08 | NaN | 2.487490e-08 |
20000 | NaN | NaN | 0.000147 | 0.000216 | NaN | 0.000294 | NaN | NaN | 7.356025e-09 | 1.078465e-08 | NaN | 1.469057e-08 |
50000 | NaN | NaN | 0.000503 | 0.001863 | NaN | 0.001000 | NaN | NaN | 1.006655e-08 | 3.725768e-08 | NaN | 2.000188e-08 |
75000 | NaN | NaN | 0.000663 | 0.001879 | NaN | 0.001749 | NaN | NaN | 8.845087e-09 | 2.505991e-08 | NaN | 2.331396e-08 |
100000 | NaN | NaN | 0.001209 | 0.002980 | NaN | 0.002557 | NaN | NaN | 1.208690e-08 | 2.980086e-08 | NaN | 2.556766e-08 |
150000 | NaN | NaN | 0.002923 | 0.003762 | NaN | 0.004001 | NaN | NaN | 1.948814e-08 | 2.508106e-08 | NaN | 2.667062e-08 |
200000 | NaN | NaN | 0.004001 | 0.005440 | NaN | 0.005731 | NaN | NaN | 2.000416e-08 | 2.720136e-08 | NaN | 2.865267e-08 |
300000 | NaN | NaN | 0.005366 | 0.007180 | NaN | 0.008365 | NaN | NaN | 1.788538e-08 | 2.393301e-08 | NaN | 2.788189e-08 |
400000 | NaN | NaN | 0.007872 | 0.010510 | NaN | 0.011489 | NaN | NaN | 1.967972e-08 | 2.627497e-08 | NaN | 2.872169e-08 |
500000 | NaN | NaN | 0.011016 | 0.017021 | NaN | 0.015013 | NaN | NaN | 2.203297e-08 | 3.404131e-08 | NaN | 3.002589e-08 |
600000 | NaN | NaN | 0.012468 | 0.018040 | NaN | 0.021193 | NaN | NaN | 2.077927e-08 | 3.006664e-08 | NaN | 3.532122e-08 |
libs = list(c for c in piv.columns if "ave_" in c)
ax = piv.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch")
ax.grid(True);
Le minimum obtenu est pour soit 10 ns. Cela montre que la comparaisson précédente était incomplète voire biaisée. Tout dépend de l’usage qu’on fait de la fonction de prédiction même s’il sera toujours possible de d’écrire un code spécialisé plus rapide que toute autre fonction générique. En général, plus on reste du côté Python, plus le programme est lent. Le nombre de passage de l’un à l’autre, selon la façon dont il est fait ralenti aussi. En tenant compte de cela, le programme rouge sera plus lent que le vert.
from pyquickhelper.helpgen import NbImage
NbImage("pycpp.png")
Ces résultats sont d’une façon générale assez volatile car le temps de calcul est enrobé dans plusieurs fonctions Python qui rendent une mesure précise difficile. Il reste néanmoins une bonne idée des ordres de grandeurs.
Random Forest#
On reproduit les mêmes résultats pour une random forest mais la réécriture n’est plus aussi simple qu’une régression linéaire.
Une prédiction à la fois#
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10)
rf.fit(diabetes_X_train, diabetes_y_train)
RandomForestRegressor(n_estimators=10)
memo_time = []
x = diabetes_X_test[:1]
memo_time.append(timeexe("sklearn-rf", "rf.predict(x)", repeat=100, number=20))
Moyenne: 980.23 µs Ecart-type 60.93 µs (with 20 runs) in [937.55 µs, 1.11 ms]
C’est beaucoup plus long que la régression linéaire. On essaye avec onnx.
if ok_onnx:
onnxrf_model = convert_sklearn(
rf, 'model', [('input', FloatTensorType([None, clr.coef_.shape[0]]))],
target_opset=11)
onnxrf_model.ir_version = 6
save_model(onnxrf_model, 'model_rf.onnx')
model_onnx = onnx.load('model_rf.onnx')
if ok_onnx:
sess = onnxruntime.InferenceSession("model_rf.onnx")
for i in sess.get_inputs():
print('Input:', i)
for o in sess.get_outputs():
print('Output:', o)
def predict_onnxrt_rf(x):
return sess.run(["variable"], {'input': x})
print(predict_onnxrt_rf(x.astype(numpy.float32)))
memo_time.append(timeexe("onnx-rf", "predict_onnxrt_rf(x.astype(numpy.float32))",
repeat=100, number=20))
Input: NodeArg(name='input', type='tensor(float)', shape=[None, 10])
Output: NodeArg(name='variable', type='tensor(float)', shape=[None, 1])
[array([[243.00002]], dtype=float32)]
Moyenne: 14.36 µs Ecart-type 4.18 µs (with 20 runs) in [11.75 µs, 22.22 µs]
C’est beaucoup plus rapide.
import pandas
df2 = pandas.DataFrame(data=memo_time)
df2 = df2.set_index("legend").sort_values("average")
df2
average | deviation | first | first3 | last3 | repeat | min5 | max5 | code | run | |
---|---|---|---|---|---|---|---|---|---|---|
legend | ||||||||||
onnx-rf | 0.000014 | 0.000004 | 0.000047 | 0.000027 | 0.000014 | 100 | 0.000012 | 0.000022 | predict_onnxrt_rf(x.astype(numpy.float32)) | 20 |
sklearn-rf | 0.000980 | 0.000061 | 0.001308 | 0.001087 | 0.001075 | 100 | 0.000938 | 0.001106 | rf.predict(x) | 20 |
fig, ax = plt.subplots(1, 1, figsize=(14,4))
df2[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
<ipython-input-89-8d560b1fbb3b>:5: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
ax.grid(b=True, which="major")
<ipython-input-89-8d560b1fbb3b>:6: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
ax.grid(b=True, which="minor");
Prédiction en batch#
memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
500000, 600000]
number = 10
repeat = 10
for i in batch[:15]:
if i <= diabetes_X_test.shape[0]:
mx = diabetes_X_test[:i]
else:
mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
mx = numpy.vstack(mxs)
mx = mx[:i]
print("batch", "=", i)
memo.append(timeexe("sklearn.predict %d" % i, "rf.predict(mx)",
repeat=repeat, number=number))
memo[-1]["batch"] = i
memo[-1]["lib"] = "sklearn"
if ok_onnx:
memo.append(timeexe("onnxruntime %d" % i,
"predict_onnxrt_rf(mx.astype(numpy.float32))",
repeat=repeat, number=number))
memo[-1]["batch"] = i
memo[-1]["lib"] = "onnxruntime"
batch = 1
Moyenne: 1.11 ms Ecart-type 145.19 µs (with 10 runs) in [1.03 ms, 1.54 ms]
Moyenne: 15.70 µs Ecart-type 13.36 µs (with 10 runs) in [11.20 µs, 55.77 µs]
batch = 10
Moyenne: 1.14 ms Ecart-type 162.36 µs (with 10 runs) in [952.57 µs, 1.51 ms]
Moyenne: 25.55 µs Ecart-type 9.43 µs (with 10 runs) in [17.37 µs, 42.15 µs]
batch = 100
Moyenne: 1.09 ms Ecart-type 80.51 µs (with 10 runs) in [1.01 ms, 1.31 ms]
Moyenne: 38.04 µs Ecart-type 17.20 µs (with 10 runs) in [32.02 µs, 89.62 µs]
batch = 200
Moyenne: 1.42 ms Ecart-type 126.30 µs (with 10 runs) in [1.15 ms, 1.71 ms]
Moyenne: 82.17 µs Ecart-type 56.27 µs (with 10 runs) in [43.86 µs, 213.17 µs]
batch = 500
Moyenne: 1.79 ms Ecart-type 543.34 µs (with 10 runs) in [1.31 ms, 3.18 ms]
Moyenne: 130.31 µs Ecart-type 30.45 µs (with 10 runs) in [85.15 µs, 190.08 µs]
batch = 1000
Moyenne: 1.53 ms Ecart-type 93.12 µs (with 10 runs) in [1.42 ms, 1.70 ms]
Moyenne: 249.60 µs Ecart-type 23.96 µs (with 10 runs) in [232.24 µs, 312.27 µs]
batch = 2000
Moyenne: 2.09 ms Ecart-type 149.23 µs (with 10 runs) in [1.89 ms, 2.33 ms]
Moyenne: 393.37 µs Ecart-type 165.01 µs (with 10 runs) in [283.40 µs, 734.87 µs]
batch = 3000
Moyenne: 2.77 ms Ecart-type 921.32 µs (with 10 runs) in [2.24 ms, 5.40 ms]
Moyenne: 432.57 µs Ecart-type 16.08 µs (with 10 runs) in [422.71 µs, 479.76 µs]
batch = 4000
Moyenne: 2.96 ms Ecart-type 331.99 µs (with 10 runs) in [2.63 ms, 3.69 ms]
Moyenne: 1.04 ms Ecart-type 485.53 µs (with 10 runs) in [598.92 µs, 2.38 ms]
batch = 5000
Moyenne: 3.27 ms Ecart-type 348.48 µs (with 10 runs) in [3.00 ms, 4.16 ms]
Moyenne: 996.95 µs Ecart-type 207.84 µs (with 10 runs) in [767.12 µs, 1.47 ms]
batch = 10000
Moyenne: 5.26 ms Ecart-type 404.81 µs (with 10 runs) in [4.96 ms, 6.34 ms]
Moyenne: 1.75 ms Ecart-type 317.18 µs (with 10 runs) in [1.34 ms, 2.13 ms]
batch = 20000
Moyenne: 10.52 ms Ecart-type 1.11 ms (with 10 runs) in [9.21 ms, 13.42 ms]
Moyenne: 4.40 ms Ecart-type 522.54 µs (with 10 runs) in [3.52 ms, 5.43 ms]
batch = 50000
Moyenne: 24.33 ms Ecart-type 2.90 ms (with 10 runs) in [21.27 ms, 29.83 ms]
Moyenne: 8.21 ms Ecart-type 1.31 ms (with 10 runs) in [7.32 ms, 11.74 ms]
batch = 75000
Moyenne: 31.54 ms Ecart-type 251.81 µs (with 10 runs) in [31.19 ms, 32.06 ms]
Moyenne: 12.22 ms Ecart-type 908.21 µs (with 10 runs) in [11.38 ms, 14.24 ms]
batch = 100000
Moyenne: 42.05 ms Ecart-type 745.44 µs (with 10 runs) in [41.22 ms, 43.35 ms]
Moyenne: 16.17 ms Ecart-type 1.01 ms (with 10 runs) in [14.98 ms, 17.54 ms]
dfbrf = pandas.DataFrame(memo)[["average", "lib", "batch"]]
pivrf = dfbrf.pivot("batch", "lib", "average")
for c in pivrf.columns:
pivrf["ave_" + c] = pivrf[c] / pivrf.index
libs = list(c for c in pivrf.columns if "ave_" in c)
ax = pivrf.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch\nrandom forest")
ax.grid(True);