Benchmark of TreeEnsemble implementation#

The following example compares the inference time between onnxruntime and sklearn.ensemble.RandomForestRegressor, for different numbers of estimators, maximum depths, and levels of parallelization. It does so for a fixed number of rows and features.

import and registration of necessary converters#

import pickle
import os
import time
from itertools import product

import matplotlib.pyplot as plt
import numpy
import pandas
from lightgbm import LGBMRegressor
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxruntime import InferenceSession, SessionOptions
from psutil import cpu_count
from pyquickhelper.loghelper import run_cmd
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_regressor_output_shapes
from sklearn import set_config
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
from xgboost import XGBRegressor

def skl2onnx_convert_lightgbm(scope, operator, container):
    """Convert a LightGBM model with ``convert_lightgbm``, honouring ``split``.

    The ``split`` option is read from the options returned by *scope* for the
    raw model and stored on *operator* before the conversion is delegated to
    ``convert_lightgbm``.

    :param scope: conversion scope, queried with ``get_options``
    :param operator: operator being converted, receives the ``split`` attribute
    :param container: container forwarded unchanged to ``convert_lightgbm``
    """
    options = scope.get_options(operator.raw_operator)
    # The attribute must always exist: take the configured value when the
    # option is present and default to None only when it is absent.
    if "split" in options:
        operator.split = options["split"]
    else:
        operator.split = None
    convert_lightgbm(scope, operator, container)

    options={"split": None},

# The following instruction reduces the time spent by scikit-learn
# to validate the data.

Machine details#

# Show how many cores psutil's cpu_count() reports for this machine.
core_message = f"Number of cores: {cpu_count()}"
print(core_message)
Number of cores: 8

But this information is not usually enough. Let’s extract the cache information.

    out, err = run_cmd("lscpu")
except Exception as e:
    print(f"lscpu not available: {e}")
<Popen: returncode: None args: ['lscpu']>

Or with the following command.

# Alternative way to get the processor description: dump /proc/cpuinfo.
# NOTE(review): the recorded output that follows is a Popen repr, so a
# `print(out)` presumably followed this statement — confirm.
out, err = run_cmd("cat /proc/cpuinfo")
<Popen: returncode: None args: ['cat', '/proc/cpuinfo']>

Function to measure inference time#

def measure_inference(fct, X, repeat, max_time=5, quantile=1):
    """
    Run *repeat* times the same function on data *X*.

    At least three measures are taken (when *repeat* allows it) before the
    time budget *max_time* is checked. The average is computed after removing
    the *quantile* fastest and slowest measures, unless fewer than three
    measures would remain, in which case all of them are kept.

    :param fct: function to run
    :param X: data
    :param repeat: number of times to run
    :param max_time: maximum time to use to measure the inference,
        None to always run *repeat* times
    :param quantile: number of extreme measures to exclude at each end
        before computing the average
    :return: number of runs, sum of the time, average, median
    :raises ValueError: if *repeat* is lower than 1
    """
    if repeat < 1:
        raise ValueError(f"repeat must be at least 1, not {repeat!r}.")
    times = []
    for _ in range(repeat):
        begin = time.perf_counter()
        fct(X)
        times.append(time.perf_counter() - begin)
        # Always collect three measures before looking at the time budget.
        if len(times) < 3:
            continue
        if max_time is not None and sum(times) >= max_time:
            break
    # Sorting is required both to trim the extremes and to read the median.
    times.sort()
    # Trim the extremes only if at least three measures remain afterwards.
    trim = quantile if (len(times) - quantile * 2) >= 3 else 0
    kept = times[trim:-trim] if trim else times
    return (len(times), sum(times), sum(kept) / len(kept), times[len(times) // 2])


The following script benchmarks the inference for the same model for a random forest and onnxruntime after it was converted into ONNX and for the following configurations.

# Choose the size of the benchmark from the number of cores: a light
# configuration below 12 cores, a heavier one otherwise.
small = cpu_count() < 12
if small:
    N = 1000
    n_features = 10
    n_jobs = [1, cpu_count() // 2, cpu_count()]
    n_ests = [10, 20, 30]
    depth = [4, 6, 8, 10]
    Regressor = RandomForestRegressor
else:
    N = 100000
    n_features = 50
    n_jobs = [cpu_count(), cpu_count() // 2, 1]
    n_ests = [100, 200, 400]
    depth = [6, 8, 10, 12, 14]
    Regressor = RandomForestRegressor

# Label stored with the results and used in the name of the saved file.
legend = f"parallel-nf-{n_features}-"

# Machines with 1 or 2 cores yield duplicated values:
# keep each value once, largest first.
n_jobs = sorted(set(n_jobs), reverse=True)

Benchmark parameters

# Measurement settings.
repeat = 7  # how many times the same inference is repeated
quantile = 1  # extreme timings to exclude
max_time = 5  # time budget in seconds for one configuration


# Synthetic regression data: the target is the mean of the features of a row
# plus random noise; everything is stored as float32.
X = numpy.random.randn(N, n_features).astype(numpy.float32)
noise_scale = n_features // 5
noise = (numpy.random.randn(len(X)) / noise_scale).astype(numpy.float32)
y = noise + X.mean(axis=1)
# Number of leading rows kept for training.
n_train = min(N, N // 3)

# Every measure is appended to `data`; the configurations to benchmark are
# the cartesian product of the number of jobs, depths and estimators.
data = []
couples = list(product(n_jobs, depth, n_ests))
bar = tqdm(couples)
# Trained models and converted ONNX files are cached in this folder.
cache_dir = "_cache"
# exist_ok avoids the check-then-create race of a separate existence test.
os.makedirs(cache_dir, exist_ok=True)

for n_j, max_depth, n_estimators in bar:
    if n_j == 1 and n_estimators > n_ests[0]:
        # skipping: with a single job only the smallest forest is measured
        continue

    # parallelization
    cache_name = os.path.join(
        cache_dir, f"nf-{X.shape[1]}-rf-J-{n_j}-E-{n_estimators}-D-{max_depth}.pkl"
    )
    if os.path.exists(cache_name):
        # NOTE: pickle must only read files written by this script; never
        # point cache_dir at a folder holding untrusted content.
        with open(cache_name, "rb") as f:
            rf = pickle.load(f)
    else:
        bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} train rf")
        if n_j == 1 and issubclass(Regressor, RandomForestRegressor):
            # Train with every core, then restrict the inference to one job.
            rf = Regressor(max_depth=max_depth, n_estimators=n_estimators, n_jobs=-1)
            rf.fit(X[:n_train], y[:n_train])
            rf.n_jobs = 1
        else:
            rf = Regressor(max_depth=max_depth, n_estimators=n_estimators, n_jobs=n_j)
            rf.fit(X[:n_train], y[:n_train])
        with open(cache_name, "wb") as f:
            pickle.dump(rf, f)

    bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} ISession")
    so = SessionOptions()
    so.intra_op_num_threads = n_j
    cache_name = os.path.join(
        cache_dir, f"nf-{X.shape[1]}-rf-J-{n_j}-E-{n_estimators}-D-{max_depth}.onnx"
    )
    if os.path.exists(cache_name):
        sess = InferenceSession(cache_name, so)
    else:
        bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} cvt onnx")
        onx = to_onnx(rf, X[:1])
        with open(cache_name, "wb") as f:
            f.write(onx.SerializeToString())
        sess = InferenceSession(cache_name, so)
    onx_size = os.stat(cache_name).st_size

    # run once to avoid counting the first run
    bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predict1")
    rf.predict(X)
    sess.run(None, {"X": X})

    # fixed data
    # NOTE(review): n_jobs, max_depth and n_estimators are required by the
    # plotting code; the remaining keys are assumed so that the table has
    # 13 columns — confirm against the original script.
    obs = dict(
        n_jobs=n_j,
        max_depth=max_depth,
        n_estimators=n_estimators,
        repeat=repeat,
        max_time=max_time,
        name=rf.__class__.__name__,
        n_rows=X.shape[0],
        n_features=X.shape[1],
        onnx_size=onx_size,
    )

    # baseline
    bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predictB")
    r, t, mean, med = measure_inference(rf.predict, X, repeat=repeat, max_time=max_time)
    o1 = obs.copy()
    o1.update(dict(avg=mean, med=med, n_runs=r, ttime=t, name="base"))
    data.append(o1)

    # onnxruntime
    bar.set_description(f"J={n_j} E={n_estimators} D={max_depth} predictO")
    r, t, mean, med = measure_inference(
        lambda x: sess.run(None, {"X": x}), X, repeat=repeat, max_time=max_time
    )
    o2 = obs.copy()
    o2.update(dict(avg=mean, med=med, n_runs=r, ttime=t, name="ort_"))
    data.append(o2)
Saving data#

# Common prefix, inside the cache folder, of the files written below.
name = os.path.join(cache_dir, "plot_beanchmark_rf")
print(f"Saving data into {name!r}")

# df keeps the raw measures; the saved copy also carries the legend column.
df = pandas.DataFrame(data)
df2 = df.assign(legend=legend)
df2.to_csv(f"{name}-{legend}.csv", index=False)
Saving data into '_cache/plot_beanchmark_rf'

Printing the data

    n_jobs  max_depth  n_estimators  ...       med  n_runs     ttime
0        8          4            10  ...  0.017314       7  0.123752
1        8          4            10  ...  0.000360       7  0.006377
2        8          4            20  ...  0.024644       7  0.169294
3        8          4            20  ...  0.000430       7  0.003586
4        8          4            30  ...  0.032611       7  0.228959
5        8          4            30  ...  0.000497       7  0.008027
6        8          6            10  ...  0.016544       7  0.117438
7        8          6            10  ...  0.000408       7  0.003168
8        8          6            20  ...  0.024973       7  0.176754
9        8          6            20  ...  0.000509       7  0.003887
10       8          6            30  ...  0.033109       7  0.228971
11       8          6            30  ...  0.000758       7  0.006673
12       8          8            10  ...  0.017663       7  0.124027
13       8          8            10  ...  0.000490       7  0.005954
14       8          8            20  ...  0.024419       7  0.172915
15       8          8            20  ...  0.000552       7  0.007717
16       8          8            30  ...  0.032947       7  0.464705
17       8          8            30  ...  0.000923       7  0.010747
18       8         10            10  ...  0.016950       7  0.121992
19       8         10            10  ...  0.000493       7  0.005688
20       8         10            20  ...  0.025172       7  0.177189
21       8         10            20  ...  0.000621       7  0.007173
22       8         10            30  ...  0.032112       7  0.226256
23       8         10            30  ...  0.000764       7  0.005758
24       4          4            10  ...  0.015495       7  0.107789
25       4          4            10  ...  0.000377       7  0.002934
26       4          4            20  ...  0.022609       7  0.158891
27       4          4            20  ...  0.000516       7  0.003890
28       4          4            30  ...  0.030467       7  0.215163
29       4          4            30  ...  0.000696       7  0.005178
30       4          6            10  ...  0.015338       7  0.107175
31       4          6            10  ...  0.000451       7  0.003470
32       4          6            20  ...  0.023434       7  0.166708
33       4          6            20  ...  0.000627       7  0.004826
34       4          6            30  ...  0.030764       7  0.217064
35       4          6            30  ...  0.000875       7  0.006534
36       4          8            10  ...  0.015330       7  0.108755
37       4          8            10  ...  0.000523       7  0.003982
38       4          8            20  ...  0.023504       7  0.169129
39       4          8            20  ...  0.000749       7  0.005626
40       4          8            30  ...  0.030934       7  0.219070
41       4          8            30  ...  0.001100       7  0.008071
42       4         10            10  ...  0.015704       7  0.113016
43       4         10            10  ...  0.000600       7  0.004487
44       4         10            20  ...  0.023624       7  0.165025
45       4         10            20  ...  0.000879       7  0.008380
46       4         10            30  ...  0.031204       7  0.222933
47       4         10            30  ...  0.001339       7  0.009620
48       1          4            10  ...  0.007009       7  0.049109
49       1          4            10  ...  0.000722       7  0.005244
50       1          6            10  ...  0.007338       7  0.051313
51       1          6            10  ...  0.000924       7  0.006637
52       1          8            10  ...  0.007591       7  0.053217
53       1          8            10  ...  0.001223       7  0.008771
54       1         10            10  ...  0.007874       7  0.055277
55       1         10            10  ...  0.001461       7  0.010455

[56 rows x 13 columns]


# One graph per (number of jobs, number of trees): rows follow n_jobs,
# columns follow n_ests.
n_rows = len(n_jobs)
n_cols = len(n_ests)

# squeeze=False keeps `axes` two-dimensional even with a single row or
# column, so that axes[i, j] is always valid.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False
)

for n_j, n_estimators in tqdm(product(n_jobs, n_ests)):
    i = n_jobs.index(n_j)
    j = n_ests.index(n_estimators)
    ax = axes[i, j]

    subdf = df[(df.n_estimators == n_estimators) & (df.n_jobs == n_j)]
    if subdf.shape[0] == 0:
        # nothing was measured for this configuration
        continue
    piv = subdf.pivot(index="max_depth", columns="name", values=["avg", "med"])
    piv.plot(ax=ax, title=f"jobs={n_j}, trees={n_estimators}")
    ax.set_ylabel(f"n_jobs={n_j}", fontsize="small")
    ax.set_xlabel("max_depth", fontsize="small")

    # ratio, drawn on a secondary y axis
    ax2 = ax.twinx()
    piv1 = subdf.pivot(index="max_depth", columns="name", values="avg")
    piv1["speedup"] = piv1["base"] / piv1["ort_"]
    ax2.plot(piv1.index, piv1["speedup"], "b--", label="speedup avg")

    piv1 = subdf.pivot(index="max_depth", columns="name", values="med")
    piv1["speedup"] = piv1["base"] / piv1["ort_"]
    ax2.plot(piv1.index, piv1["speedup"], "y--", label="speedup med")

    # horizontal line at 1: above it, the "ort_" timings are lower than "base"
    ax2.plot(piv1.index, [1 for _ in piv1.index], "k--", label="no speedup")

for i in range(axes.shape[0]):
    for j in range(axes.shape[1]):
        axes[i, j].legend(fontsize="small")

RandomForestRegressor X.shape=(1000, 10), jobs=8, trees=10, jobs=8, trees=20, jobs=8, trees=30, jobs=4, trees=10, jobs=4, trees=20, jobs=4, trees=30, jobs=1, trees=10
0it [00:00, ?it/s]
1it [00:00,  3.88it/s]
2it [00:00,  3.89it/s]
3it [00:00,  3.90it/s]
4it [00:01,  3.86it/s]
5it [00:01,  3.86it/s]
6it [00:01,  3.82it/s]
7it [00:01,  3.80it/s]
9it [00:01,  4.91it/s]
Total running time of the script: ( 0 minutes 44.688 seconds)

