Note
Click here to download the full example code
Benchmark Random Forests, Tree Ensemble, (AoS and SoA)#
The script compares different implementations for the operator TreeEnsembleRegressor.
baseline: RandomForestRegressor from scikit-learn
ort: onnxruntime,
mlprodict: an implementation based on an array of structures, every structure describes a node,
mlprodict2: a similar implementation which, instead of an array of structures, relies on a structure of arrays; it parallelizes by blocks of 128 observations and, inside every block, goes through the trees and then through the observations (double loop),
mlprodict3: parallelizes by trees, this implementation is faster when the depth is higher than 10.
A structure of arrays has better performance, see Case study: Comparing Arrays of Structures and Structures of Arrays Data Layouts for a Compute-Intensive Loop. See also AoS and SoA.
Profile the execution
py-spy can be used to profile the execution of a program. The profile is more informative if the code is compiled with debug information.
py-spy record --native -r 10 -o plot_random_forest_reg.svg -- python plot_random_forest_reg.py
Import#
import warnings
from time import perf_counter as time
from multiprocessing import cpu_count
import numpy
from numpy.random import rand
from numpy.testing import assert_almost_equal
import pandas
import matplotlib.pyplot as plt
from sklearn import config_context
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils._testing import ignore_warnings
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession
from mlprodict.onnxrt import OnnxInference
Available optimisation on this machine.
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
print(code_optimisation())
Out:
AVX-omp=8
Versions#
def version():
    """Return a DataFrame with the current date and the package versions.

    The frame has two columns, ``name`` and ``version``. The first row holds
    the current date, the following rows hold the ``__version__`` attribute
    of every package involved in the benchmark.
    """
    from datetime import datetime
    import sklearn
    import numpy
    import onnx
    import onnxruntime
    import skl2onnx
    import mlprodict

    # Displayed label -> imported module, in the order of the report rows.
    packages = [
        ("numpy", numpy),
        ("scikit-learn", sklearn),
        ("onnx", onnx),
        ("onnxruntime", onnxruntime),
        ("skl2onnx", skl2onnx),
        ("mlprodict", mlprodict),
    ]
    rows = [{"name": "date", "version": str(datetime.now())}]
    rows.extend({"name": label, "version": module.__version__}
                for label, module in packages)
    return pandas.DataFrame(rows)
version()
Implementations to benchmark#
def fcts_model(X, y, max_depth, n_estimators, n_jobs):
    """Train a RandomForestRegressor and build the functions to benchmark.

    The forest is fitted on ``(X, y)``, converted to ONNX with a float
    input named ``'X'`` and loaded in onnxruntime and in three
    ``OnnxInference`` python runtimes.

    :param X: training features, ``X.shape[1]`` gives the number of
        columns declared for the ONNX input
    :param y: training target
    :param max_depth: forwarded to ``RandomForestRegressor``
    :param n_estimators: forwarded to ``RandomForestRegressor``
    :param n_jobs: forwarded to ``RandomForestRegressor``
    :return: dictionary ``{'predict': (skl, ort, mlprodict, mlprodict2,
        mlprodict3)}``, every item is a function taking a matrix of
        features and returning the predictions
    """
    rf = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators,
                               n_jobs=n_jobs)
    rf.fit(X, y)

    initial_types = [('X', FloatTensorType([None, X.shape[1]]))]
    onx = convert_sklearn(rf, initial_types=initial_types)
    sess = InferenceSession(onx.SerializeToString())
    outputs = [o.name for o in sess.get_outputs()]
    # Only the first output is requested from every runtime.
    first_output = outputs[:1]
    name = outputs[0]

    def build_python_runtime(variant):
        # NOTE(review): the second argument of the private ``_init`` presumably
        # selects the tree ensemble implementation (1, 2 or 3) - confirm
        # against mlprodict.
        runtime = OnnxInference(onx, runtime="python")
        runtime.sequence_[0].ops_._init(numpy.float32, variant)
        return runtime

    oinf = build_python_runtime(1)
    oinf2 = build_python_runtime(2)
    oinf3 = build_python_runtime(3)

    # The objects are bound as default arguments and the functions use
    # these bound names.
    def predict_skl_predict(X, model=rf):
        return model.predict(X)

    def predict_onnxrt_predict(X, sess=sess):
        return sess.run(first_output, {'X': X})[0]

    def predict_onnx_inference(X, oinf=oinf):
        return oinf.run({'X': X})[name]

    def predict_onnx_inference2(X, oinf2=oinf2):
        return oinf2.run({'X': X})[name]

    def predict_onnx_inference3(X, oinf3=oinf3):
        return oinf3.run({'X': X})[name]

    return {'predict': (
        predict_skl_predict, predict_onnxrt_predict,
        predict_onnx_inference, predict_onnx_inference2,
        predict_onnx_inference3)}
Benchmarks#
def allow_configuration(**kwargs):
    """Tell whether a benchmark configuration must be run.

    The keyword arguments describing the configuration are ignored:
    every configuration is accepted.

    :return: always ``True``
    """
    return True
def _timed_calls(fct, Xs, max_calls=None, time_budget=None):
    """Call ``fct`` on successive items of ``Xs`` and measure the time.

    :param fct: function taking one item of ``Xs``
    :param Xs: inputs, one per call
    :param max_calls: stops after this number of calls (no limit if None)
    :param time_budget: stops once this number of seconds is spent
        (no limit if None)
    :return: tuple ``(average time per call, number of calls, last result)``
    :raises ZeroDivisionError: if ``Xs`` is empty
    """
    calls = 0
    last = None
    begin = time()
    for X in Xs:
        last = fct(X)
        calls += 1
        if max_calls is not None and calls >= max_calls:
            break
        if time_budget is not None and time() - begin >= time_budget:
            break
    elapsed = time() - begin
    return elapsed / calls, calls, last


def bench(n_obs, n_features, max_depths, n_estimatorss, n_jobss,
          methods, repeat=10, verbose=False):
    """Measure the prediction time of every implementation over a grid.

    For every combination of the parameters, a model is trained on
    50000 random observations with ``fcts_model`` and every prediction
    function is timed on the same random batches. The baseline
    (scikit-learn) is called until one second is spent or all batches
    are consumed, the other implementations are then called the same
    number of times.

    :param n_obs: batch sizes to measure
    :param n_features: numbers of features to measure
    :param max_depths: tree depths to measure
    :param n_estimatorss: numbers of trees to measure
    :param n_jobss: values of *n_jobs* to measure
    :param methods: keys of the dictionary returned by ``fcts_model``
    :param repeat: number of random batches created for every
        configuration, it must be at least 1 (no call can be averaged
        otherwise)
    :param verbose: prints every observation as soon as it is available
    :return: list of dictionaries, one per measured configuration, with
        the parameters and the keys ``time_skl``, ``time_ort``,
        ``time_mlprodict``, ``time_mlprodict2``, ``time_mlprodict3``
        (average time of one call)

    A warning is emitted for every implementation whose last prediction
    differs from the baseline (batches up to 10000 observations).
    """
    res = []
    for nfeat in n_features:

        ntrain = 50000
        X_train = rand(ntrain, nfeat).astype(numpy.float32)
        eps = rand(ntrain) - 0.5
        y_train = X_train.sum(axis=1) + eps

        for n_jobs in n_jobss:
            for max_depth in max_depths:
                for n_estimators in n_estimatorss:
                    fcts = fcts_model(X_train, y_train,
                                      max_depth, n_estimators, n_jobs)

                    for n in n_obs:
                        for method in methods:

                            fct1, fct2, fct3, fct4, fct5 = fcts[method]

                            if not allow_configuration(
                                    n=n, nfeat=nfeat, max_depth=max_depth,
                                    n_estimator=n_estimators, n_jobs=n_jobs,
                                    method=method):
                                continue

                            obs = dict(n_obs=n, nfeat=nfeat,
                                       max_depth=max_depth,
                                       n_estimators=n_estimators,
                                       method=method,
                                       n_jobs=n_jobs)

                            # creates different inputs to avoid caching
                            Xs = [rand(n, nfeat).astype(numpy.float32)
                                  for _ in range(repeat)]

                            # measures the baseline,
                            # stops if longer than a second
                            with config_context(assume_finite=True):
                                elapsed, repeated, p1 = _timed_calls(
                                    fct1, Xs, time_budget=1)
                            obs["time_skl"] = elapsed

                            # measures the other implementations on the
                            # same number of batches as the baseline
                            candidates = (
                                ("ort", fct2),
                                ("mlprodict", fct3),
                                ("mlprodict2", fct4),
                                ("mlprodict3", fct5))
                            predictions = {}
                            for engine, fct in candidates:
                                elapsed, _, pred = _timed_calls(
                                    fct, Xs, max_calls=repeated)
                                obs["time_%s" % engine] = elapsed
                                predictions[engine] = pred

                            # final
                            res.append(obs)
                            if verbose:
                                print("bench", len(res), ":", obs)

                            # checks that every implementation produces
                            # the same outputs as the baseline; all of them
                            # ended on the same batch
                            if n <= 10000:
                                expected = p1.ravel()
                                for engine, pred in predictions.items():
                                    try:
                                        assert_almost_equal(
                                            expected, pred.ravel(), decimal=5)
                                    except AssertionError as e:
                                        warnings.warn(
                                            "%s: %s" % (engine, e))
    return res
Graphs#
def plot_rf_models(dfr):
    """Plot the speedup of every implementation against scikit-learn.

    The figure has one row per engine (every column ``time_<engine>``
    except ``time_skl``) and one column per combination of
    (*max_depth*, *nfeat*, *n_estimators*, *n_jobs*). Every graph shows
    the speedup ``time_skl / time_<engine>`` for every batch size.

    :param dfr: DataFrame holding the observations returned by ``bench``,
        it is modified in place: a column ``speedup_<engine>`` is added
        for every engine
    :return: tuple ``(fig, ax)``, *ax* is the last axis drawn on or None
        if nothing was drawn

    The function also prints the transposed last rows of *dfr*.
    """

    def autolabel(ax, rects):
        # Writes the speedup above every bar.
        for rect in rects:
            height = rect.get_height()
            ax.annotate('%1.1fx' % height,
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom',
                        fontsize=8)

    # Removes the prefix instead of splitting on '_' so that an engine
    # name may contain an underscore.
    prefix = "time_"
    engines = [c[len(prefix):] for c in dfr.columns if c.startswith(prefix)]
    engines = [e for e in engines if e != 'skl']
    for engine in engines:
        dfr["speedup_%s" % engine] = dfr["time_skl"] / dfr["time_%s" % engine]
    print(dfr.tail().T)

    configs = [(max_depth, nf, est, n_jobs)
               for max_depth in sorted(set(dfr.max_depth))
               for nf in sorted(set(dfr.nfeat))
               for est in sorted(set(dfr.n_estimators))
               for n_jobs in sorted(set(dfr.n_jobs))]

    # At least 4 columns as before, more if there are more configurations.
    ncols = max(4, len(configs))
    # squeeze=False keeps a 2D array of axes even with a single engine.
    fig, axs = plt.subplots(
        len(engines), ncols,
        figsize=(3.5 * ncols, 4 * len(engines)),
        sharey=True, squeeze=False)

    ax = None
    for row, engine in enumerate(engines):
        name = "RandomForestRegressor - %s" % engine
        speedup = "speedup_%s" % engine
        for pos, (max_depth, nf, est, n_jobs) in enumerate(configs):
            sub = dfr[(dfr.max_depth == max_depth) &
                      (dfr.nfeat == nf) &
                      (dfr.n_estimators == est) &
                      (dfr.n_jobs == n_jobs)]
            ax = axs[row, pos]
            labels = sub.n_obs
            means = sub[speedup]

            x = numpy.arange(len(labels))
            width = 0.90

            rects1 = ax.bar(x, means, width, label='Speedup')
            if pos == 0:
                # The y axis is shared: scale, limits and label are
                # only set on the first graph of the row.
                ax.set_yscale('log')
                ax.set_ylim([0.1, max(dfr[speedup])])
                ax.set_ylabel('Speedup')
            ax.set_title(
                '%s\ndepth %d - %d features\n %d estimators %d '
                'jobs' % (name, max_depth, nf, est, n_jobs))
            if row == len(engines) - 1:
                ax.set_xlabel('batch size')
            ax.set_xticks(x)
            ax.set_xticklabels(labels)
            autolabel(ax, rects1)

            # Tick.label is deprecated, tick_params sets the font size
            # of the major tick labels on both axes.
            ax.tick_params(axis='both', labelsize=8)

    fig.tight_layout()
    return fig, ax
Run benchmarks#
@ignore_warnings(category=FutureWarning)
def run_bench(repeat=100, verbose=False):
    """Run the benchmark on a fixed grid and return the observations.

    The grid covers batches from 1 to 10000 observations, 30 features,
    depths 6, 8, 10 and 12, 100 trees, and as many jobs as
    ``cpu_count()`` returns. The total time is printed.

    :param repeat: forwarded to ``bench``
    :param verbose: forwarded to ``bench``
    :return: DataFrame built from the list returned by ``bench``
    """
    grid = dict(
        n_obs=[1, 10, 100, 1000, 10000],
        methods=['predict'],
        n_features=[30],
        max_depths=[6, 8, 10, 12],
        n_estimatorss=[100],
        n_jobss=[cpu_count()],
    )

    begin = time()
    records = bench(repeat=repeat, verbose=verbose, **grid)
    finish = time()

    frame = pandas.DataFrame(records)
    print("Total time = %0.3f sec cpu=%d\n" % (finish - begin, cpu_count()))
    return frame
# Base name shared by every file written below.
name = "plot_random_forest_reg"
# Runs the whole benchmark and prints every observation.
df = run_bench(verbose=True)
# Saves the raw observations as CSV and Excel.
df.to_csv("%s.csv" % name, index=False)
df.to_excel("%s.xlsx" % name, index=False)
# Draws the speedups, saves the figure and displays it.
fig, ax = plot_rf_models(df)
fig.savefig("%s.png" % name)
plt.show()

Out:
bench 1 : {'n_obs': 1, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.04247091762499622, 'time_ort': 0.00030502312498962664, 'time_mlprodict': 0.006159674083392019, 'time_mlprodict2': 5.8167333312061e-05, 'time_mlprodict3': 5.773645837810667e-05}
bench 2 : {'n_obs': 10, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.04172783027985133, 'time_ort': 0.00032841620006365704, 'time_mlprodict': 0.0003075752401491627, 'time_mlprodict2': 0.00018000412004766985, 'time_mlprodict3': 0.011970902720058803}
bench 3 : {'n_obs': 100, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.0475549444090575, 'time_ort': 0.0002476665454609743, 'time_mlprodict': 0.0028319744547347495, 'time_mlprodict2': 0.0018971141362271737, 'time_mlprodict3': 0.00036484986379615623}
bench 4 : {'n_obs': 1000, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.08368089746139819, 'time_ort': 0.001970179692412225, 'time_mlprodict': 0.0065751938460520114, 'time_mlprodict2': 0.00506770315405447, 'time_mlprodict3': 0.0011136170002058721}
bench 5 : {'n_obs': 10000, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.10642416900009266, 'time_ort': 0.016253938600129914, 'time_mlprodict': 0.04227641559991753, 'time_mlprodict2': 0.010400230000232113, 'time_mlprodict3': 0.010949246200470952}
bench 6 : {'n_obs': 1, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.05661510194436738, 'time_ort': 0.0001276742223126348, 'time_mlprodict': 0.002630280166663902, 'time_mlprodict2': 6.272711147580089e-05, 'time_mlprodict3': 6.507322258484137e-05}
bench 7 : {'n_obs': 10, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.057405688333447974, 'time_ort': 0.0007407862774723779, 'time_mlprodict': 0.000679729666444473, 'time_mlprodict2': 0.0006018959999588939, 'time_mlprodict3': 0.0001534934445872851}
bench 8 : {'n_obs': 100, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.062035601764578396, 'time_ort': 0.001400822000184288, 'time_mlprodict': 0.0037061930000645052, 'time_mlprodict2': 0.0010432092941996148, 'time_mlprodict3': 0.00024100223527057096}
bench 9 : {'n_obs': 1000, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.08489201416705328, 'time_ort': 0.0029922674996972396, 'time_mlprodict': 0.011568191749878073, 'time_mlprodict2': 0.008338619082981799, 'time_mlprodict3': 0.0015622522502477902}
bench 10 : {'n_obs': 10000, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.1044214927002031, 'time_ort': 0.02298868330035475, 'time_mlprodict': 0.08813558239999111, 'time_mlprodict2': 0.01851972719960031, 'time_mlprodict3': 0.01439585280022584}
bench 11 : {'n_obs': 1, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.05555401268410558, 'time_ort': 0.00026656147345324586, 'time_mlprodict': 0.002880609263037944, 'time_mlprodict2': 6.78451052456359e-05, 'time_mlprodict3': 6.679878922448934e-05}
bench 12 : {'n_obs': 10, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.05839058594412765, 'time_ort': 0.0009655739999693146, 'time_mlprodict': 0.0008333724997808329, 'time_mlprodict2': 0.0009267726664903522, 'time_mlprodict3': 0.0005835795555968718}
bench 13 : {'n_obs': 100, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.07036274013322934, 'time_ort': 0.0013764745332688715, 'time_mlprodict': 0.0043549924666876905, 'time_mlprodict2': 0.001703313799710789, 'time_mlprodict3': 0.0006701170665716442}
bench 14 : {'n_obs': 1000, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.08411604050039993, 'time_ort': 0.005212512999908843, 'time_mlprodict': 0.015390202333340616, 'time_mlprodict2': 0.01496612916707818, 'time_mlprodict3': 0.0024976035001600394}
bench 15 : {'n_obs': 10000, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.10452489270028309, 'time_ort': 0.032963693600322586, 'time_mlprodict': 0.12347187549967202, 'time_mlprodict2': 0.03526667479964089, 'time_mlprodict3': 0.019373706499754915}
bench 16 : {'n_obs': 1, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.056836661944544176, 'time_ort': 0.00013444083338577507, 'time_mlprodict': 0.003001575166611777, 'time_mlprodict2': 7.363316682232026e-05, 'time_mlprodict3': 7.299644453774413e-05}
bench 17 : {'n_obs': 10, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.060974535646999034, 'time_ort': 0.0011069374702535296, 'time_mlprodict': 0.001059137294171652, 'time_mlprodict2': 0.0012197786473895095, 'time_mlprodict3': 0.0008251197060042828}
bench 18 : {'n_obs': 100, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.07253760514296508, 'time_ort': 0.001855488714292213, 'time_mlprodict': 0.005625236356926118, 'time_mlprodict2': 0.0022374512856393786, 'time_mlprodict3': 0.0011030407856326616}
bench 19 : {'n_obs': 1000, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.08763235925046804, 'time_ort': 0.00865990416665833, 'time_mlprodict': 0.018045492499974596, 'time_mlprodict2': 0.019916424917028053, 'time_mlprodict3': 0.004195665916995495}
bench 20 : {'n_obs': 10000, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.10984892990018125, 'time_ort': 0.06403504759946373, 'time_mlprodict': 0.16028350349952233, 'time_mlprodict2': 0.06378974019971792, 'time_mlprodict3': 0.02834181219950551}
Total time = 292.190 sec cpu=8
15 16 17 18 19
n_obs 1 10 100 1000 10000
nfeat 30 30 30 30 30
max_depth 12 12 12 12 12
n_estimators 100 100 100 100 100
method predict predict predict predict predict
n_jobs 8 8 8 8 8
time_skl 0.056837 0.060975 0.072538 0.087632 0.109849
time_ort 0.000134 0.001107 0.001855 0.00866 0.064035
time_mlprodict 0.003002 0.001059 0.005625 0.018045 0.160284
time_mlprodict2 0.000074 0.00122 0.002237 0.019916 0.06379
time_mlprodict3 0.000073 0.000825 0.001103 0.004196 0.028342
speedup_ort 422.763386 55.083993 39.093531 10.119322 1.71545
speedup_mlprodict 18.935612 57.570002 12.895032 4.856191 0.685341
speedup_mlprodict2 771.889413 49.988197 32.419747 4.400004 1.722047
speedup_mlprodict3 778.622333 73.897806 65.76149 20.886401 3.875861
Total running time of the script: ( 5 minutes 4.786 seconds)