Benchmark Random Forests, Tree Ensemble, (AoS and SoA)#

The script compares different implementations for the operator TreeEnsembleRegressor.

  • baseline: RandomForestRegressor from scikit-learn

  • ort: onnxruntime,

  • mlprodict: an implementation based on an array of structures, every structure describes a node,

  • mlprodict2 similar implementation but instead of having an array of structures, it relies on a structure of arrays, it parallelizes by blocks of 128 observations and inside every block, goes through trees then through observations (double loop),

  • mlprodict3: parallelizes by trees, this implementation is faster when the depth is higher than 10.

A structure of arrays has better performance: Case study: Comparing Arrays of Structures and Structures of Arrays Data Layouts for a Compute-Intensive Loop. See also AoS and SoA.

Profile the execution

py-spy can be used to profile the execution of a program. The profile is more informative if the code is compiled with debug information.

py-spy record --native -r 10 -o plot_random_forest_reg.svg -- python plot_random_forest_reg.py

Import#

import warnings
from time import perf_counter as time
from multiprocessing import cpu_count
import numpy
from numpy.random import rand
from numpy.testing import assert_almost_equal
import pandas
import matplotlib.pyplot as plt
from sklearn import config_context
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils._testing import ignore_warnings
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession
from mlprodict.onnxrt import OnnxInference

Available optimisation on this machine.

from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
print(code_optimisation())
AVX-omp=8

Versions#

def version():
    from datetime import datetime
    import sklearn
    import numpy
    import onnx
    import onnxruntime
    import skl2onnx
    import mlprodict
    df = pandas.DataFrame([
        {"name": "date", "version": str(datetime.now())},
        {"name": "numpy", "version": numpy.__version__},
        {"name": "scikit-learn", "version": sklearn.__version__},
        {"name": "onnx", "version": onnx.__version__},
        {"name": "onnxruntime", "version": onnxruntime.__version__},
        {"name": "skl2onnx", "version": skl2onnx.__version__},
        {"name": "mlprodict", "version": mlprodict.__version__},
    ])
    return df


version()
name version
0 date 2023-01-26 10:14:17.077498
1 numpy 1.23.5
2 scikit-learn 1.2.1
3 onnx 1.13.0
4 onnxruntime 1.13.1
5 skl2onnx 1.13.1
6 mlprodict 0.9.1887


Implementations to benchmark#

def fcts_model(X, y, max_depth, n_estimators, n_jobs):
    "RandomForestClassifier."
    rf = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators,
                               n_jobs=n_jobs)
    rf.fit(X, y)

    initial_types = [('X', FloatTensorType([None, X.shape[1]]))]
    onx = convert_sklearn(rf, initial_types=initial_types)
    sess = InferenceSession(onx.SerializeToString())
    outputs = [o.name for o in sess.get_outputs()]
    oinf = OnnxInference(onx, runtime="python")
    oinf.sequence_[0].ops_._init(numpy.float32, 1)
    name = outputs[0]
    oinf2 = OnnxInference(onx, runtime="python")
    oinf2.sequence_[0].ops_._init(numpy.float32, 2)
    oinf3 = OnnxInference(onx, runtime="python")
    oinf3.sequence_[0].ops_._init(numpy.float32, 3)

    def predict_skl_predict(X, model=rf):
        return rf.predict(X)

    def predict_onnxrt_predict(X, sess=sess):
        return sess.run(outputs[:1], {'X': X})[0]

    def predict_onnx_inference(X, oinf=oinf):
        return oinf.run({'X': X})[name]

    def predict_onnx_inference2(X, oinf2=oinf2):
        return oinf2.run({'X': X})[name]

    def predict_onnx_inference3(X, oinf3=oinf3):
        return oinf3.run({'X': X})[name]

    return {'predict': (
        predict_skl_predict, predict_onnxrt_predict,
        predict_onnx_inference, predict_onnx_inference2,
        predict_onnx_inference3)}

Benchmarks#

def allow_configuration(**kwargs):
    return True


def bench(n_obs, n_features, max_depths, n_estimatorss, n_jobss,
          methods, repeat=10, verbose=False):
    res = []
    for nfeat in n_features:

        ntrain = 50000
        X_train = numpy.empty((ntrain, nfeat)).astype(numpy.float32)
        X_train[:, :] = rand(ntrain, nfeat)[:, :]
        eps = rand(ntrain) - 0.5
        y_train = X_train.sum(axis=1) + eps

        for n_jobs in n_jobss:
            for max_depth in max_depths:
                for n_estimators in n_estimatorss:
                    fcts = fcts_model(X_train, y_train,
                                      max_depth, n_estimators, n_jobs)

                    for n in n_obs:
                        for method in methods:

                            fct1, fct2, fct3, fct4, fct5 = fcts[method]

                            if not allow_configuration(
                                    n=n, nfeat=nfeat, max_depth=max_depth,
                                    n_estimator=n_estimators, n_jobs=n_jobs,
                                    method=method):
                                continue

                            obs = dict(n_obs=n, nfeat=nfeat,
                                       max_depth=max_depth,
                                       n_estimators=n_estimators,
                                       method=method,
                                       n_jobs=n_jobs)

                            # creates different inputs to avoid caching
                            Xs = []
                            for r in range(repeat):
                                x = numpy.empty((n, nfeat))
                                x[:, :] = rand(n, nfeat)[:, :]
                                Xs.append(x.astype(numpy.float32))

                            # measures the baseline
                            with config_context(assume_finite=True):
                                st = time()
                                repeated = 0
                                for X in Xs:
                                    p1 = fct1(X)
                                    repeated += 1
                                    if time() - st >= 1:
                                        break  # stops if longer than a second
                                end = time()
                                obs["time_skl"] = (end - st) / repeated

                            # measures the new implementation
                            st = time()
                            r2 = 0
                            for X in Xs:
                                p2 = fct2(X)
                                r2 += 1
                                if r2 >= repeated:
                                    break
                            end = time()
                            obs["time_ort"] = (end - st) / r2

                            # measures the other new implementation
                            st = time()
                            r2 = 0
                            for X in Xs:
                                p2 = fct3(X)
                                r2 += 1
                                if r2 >= repeated:
                                    break
                            end = time()
                            obs["time_mlprodict"] = (end - st) / r2

                            # measures the other new implementation 2
                            st = time()
                            r2 = 0
                            for X in Xs:
                                p2 = fct4(X)
                                r2 += 1
                                if r2 >= repeated:
                                    break
                            end = time()
                            obs["time_mlprodict2"] = (end - st) / r2

                            # measures the other new implementation 3
                            st = time()
                            r2 = 0
                            for X in Xs:
                                p2 = fct5(X)
                                r2 += 1
                                if r2 >= repeated:
                                    break
                            end = time()
                            obs["time_mlprodict3"] = (end - st) / r2

                            # final
                            res.append(obs)
                            if verbose and (len(res) % 1 == 0 or n >= 10000):
                                print("bench", len(res), ":", obs)

                            # checks that both produce the same outputs
                            if n <= 10000:
                                if len(p1.shape) == 1 and len(p2.shape) == 2:
                                    p2 = p2.ravel()
                                try:
                                    assert_almost_equal(
                                        p1.ravel(), p2.ravel(), decimal=5)
                                except AssertionError as e:
                                    warnings.warn(str(e))
    return res

Graphs#

def plot_rf_models(dfr):

    def autolabel(ax, rects):
        for rect in rects:
            height = rect.get_height()
            ax.annotate(f'{height:1.1f}x',
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom',
                        fontsize=8)

    engines = [_.split('_')[-1] for _ in dfr.columns if _.startswith("time_")]
    engines = [_ for _ in engines if _ != 'skl']
    for engine in engines:
        dfr[f"speedup_{engine}"] = dfr["time_skl"] / dfr[f"time_{engine}"]
    print(dfr.tail().T)

    ncols = 4
    fig, axs = plt.subplots(len(engines), ncols, figsize=(
        14, 4 * len(engines)), sharey=True)

    row = 0
    for row, engine in enumerate(engines):
        pos = 0
        name = f"RandomForestRegressor - {engine}"
        for max_depth in sorted(set(dfr.max_depth)):
            for nf in sorted(set(dfr.nfeat)):
                for est in sorted(set(dfr.n_estimators)):
                    for n_jobs in sorted(set(dfr.n_jobs)):
                        sub = dfr[(dfr.max_depth == max_depth) &
                                  (dfr.nfeat == nf) &
                                  (dfr.n_estimators == est) &
                                  (dfr.n_jobs == n_jobs)]
                        ax = axs[row, pos]
                        labels = sub.n_obs
                        means = sub[f"speedup_{engine}"]

                        x = numpy.arange(len(labels))
                        width = 0.90

                        rects1 = ax.bar(x, means, width, label='Speedup')
                        if pos == 0:
                            ax.set_yscale('log')
                            ax.set_ylim([0.1, max(dfr[f"speedup_{engine}"])])

                        if pos == 0:
                            ax.set_ylabel('Speedup')
                        ax.set_title(
                            '%s\ndepth %d - %d features\n %d estimators %d '
                            'jobs' % (name, max_depth, nf, est, n_jobs))
                        if row == len(engines) - 1:
                            ax.set_xlabel('batch size')
                        ax.set_xticks(x)
                        ax.set_xticklabels(labels)
                        autolabel(ax, rects1)
                        for tick in ax.xaxis.get_major_ticks():
                            tick.label.set_fontsize(8)
                        for tick in ax.yaxis.get_major_ticks():
                            tick.label.set_fontsize(8)
                        pos += 1

    fig.tight_layout()
    return fig, ax

Run benchs#

@ignore_warnings(category=FutureWarning)
def run_bench(repeat=100, verbose=False):
    n_obs = [1, 10, 100, 1000, 10000]
    methods = ['predict']
    n_features = [30]
    max_depths = [6, 8, 10, 12]
    n_estimatorss = [100]
    n_jobss = [cpu_count()]

    start = time()
    results = bench(n_obs, n_features, max_depths, n_estimatorss, n_jobss,
                    methods, repeat=repeat, verbose=verbose)
    end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec cpu=%d\n" % (end - start, cpu_count()))

    # plot the results
    return results_df


name = "plot_random_forest_reg"
df = run_bench(verbose=True)
df.to_csv(f"{name}.csv", index=False)
df.to_excel(f"{name}.xlsx", index=False)
fig, ax = plot_rf_models(df)
fig.savefig(f"{name}.png")
plt.show()
RandomForestRegressor - ort depth 6 - 30 features  100 estimators 8 jobs, RandomForestRegressor - ort depth 8 - 30 features  100 estimators 8 jobs, RandomForestRegressor - ort depth 10 - 30 features  100 estimators 8 jobs, RandomForestRegressor - ort depth 12 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict depth 6 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict depth 8 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict depth 10 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict depth 12 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict2 depth 6 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict2 depth 8 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict2 depth 10 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict2 depth 12 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict3 depth 6 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict3 depth 8 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict3 depth 10 - 30 features  100 estimators 8 jobs, RandomForestRegressor - mlprodict3 depth 12 - 30 features  100 estimators 8 jobs
bench 1 : {'n_obs': 1, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.043522358821381044, 'time_ort': 0.000966933039624406, 'time_mlprodict': 0.0023239386146483216, 'time_mlprodict2': 6.138283313940401e-05, 'time_mlprodict3': 5.920417606830597e-05}
bench 2 : {'n_obs': 10, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.04179508175002411, 'time_ort': 0.0004311122077827652, 'time_mlprodict': 9.501399472355843e-05, 'time_mlprodict2': 0.0002291614170341442, 'time_mlprodict3': 8.111457767275472e-05}
bench 3 : {'n_obs': 100, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.046121531864628196, 'time_ort': 0.0009078537748957223, 'time_mlprodict': 0.0026506475829096003, 'time_mlprodict2': 0.0006340589077973908, 'time_mlprodict3': 0.00017222867939959872}
bench 4 : {'n_obs': 1000, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.06601385137764737, 'time_ort': 0.0018302304379176348, 'time_mlprodict': 0.0050989934970857576, 'time_mlprodict2': 0.005368530823034234, 'time_mlprodict3': 0.0012545301287900656}
bench 5 : {'n_obs': 10000, 'nfeat': 30, 'max_depth': 6, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.0890412821706074, 'time_ort': 0.016116651745202642, 'time_mlprodict': 0.01661338492219026, 'time_mlprodict2': 0.010571761580649763, 'time_mlprodict3': 0.011603824173410734}
bench 6 : {'n_obs': 1, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.04327450553925397, 'time_ort': 7.10517536693563e-05, 'time_mlprodict': 0.0022573787427973002, 'time_mlprodict2': 6.53064150052766e-05, 'time_mlprodict3': 6.276016938500106e-05}
bench 7 : {'n_obs': 10, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.0425184375878113, 'time_ort': 0.0006993165394912163, 'time_mlprodict': 0.00033775775227695704, 'time_mlprodict2': 0.0006189310806803405, 'time_mlprodict3': 9.746858268044889e-05}
bench 8 : {'n_obs': 100, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.04521715212815806, 'time_ort': 0.000907271918233322, 'time_mlprodict': 0.0030274070070489593, 'time_mlprodict2': 0.0010632537811508646, 'time_mlprodict3': 0.00023952448416663253}
bench 9 : {'n_obs': 1000, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.06683690845966339, 'time_ort': 0.0029535561334341764, 'time_mlprodict': 0.006857608476032813, 'time_mlprodict2': 0.008710083939755956, 'time_mlprodict3': 0.0017239815400292475}
bench 10 : {'n_obs': 10000, 'nfeat': 30, 'max_depth': 8, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.088407274413233, 'time_ort': 0.023452326752400648, 'time_mlprodict': 0.03608330533218881, 'time_mlprodict2': 0.018711901580293972, 'time_mlprodict3': 0.016169406997505575}
bench 11 : {'n_obs': 1, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.04291946887193868, 'time_ort': 0.00010073812639651199, 'time_mlprodict': 0.0023651322117075324, 'time_mlprodict2': 7.061512830356757e-05, 'time_mlprodict3': 6.846054263102512e-05}
bench 12 : {'n_obs': 10, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.043205117127702884, 'time_ort': 0.0009078622873251637, 'time_mlprodict': 0.0004598619125317782, 'time_mlprodict2': 0.0009409177582710981, 'time_mlprodict3': 0.00019884129869751632}
bench 13 : {'n_obs': 100, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.054252566524634234, 'time_ort': 0.001753107693634535, 'time_mlprodict': 0.0042793904676249155, 'time_mlprodict2': 0.0016905888424892175, 'time_mlprodict3': 0.0007021306327691203}
bench 14 : {'n_obs': 1000, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.06563319530687295, 'time_ort': 0.005373259496991523, 'time_mlprodict': 0.009688018573797308, 'time_mlprodict2': 0.014862701369565912, 'time_mlprodict3': 0.004831281316000968}
bench 15 : {'n_obs': 10000, 'nfeat': 30, 'max_depth': 10, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.09056370883869629, 'time_ort': 0.035183390408443906, 'time_mlprodict': 0.06634832916703697, 'time_mlprodict2': 0.036599469177114465, 'time_mlprodict3': 0.046151463757269084}
bench 16 : {'n_obs': 1, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.0429855689969069, 'time_ort': 9.491279100378354e-05, 'time_mlprodict': 0.0022313590064489595, 'time_mlprodict2': 7.53804633859545e-05, 'time_mlprodict3': 7.772545601862173e-05}
bench 17 : {'n_obs': 10, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.044195184474001115, 'time_ort': 0.001168547481622385, 'time_mlprodict': 0.0002531804367090049, 'time_mlprodict2': 0.0013627554796150198, 'time_mlprodict3': 0.0002931539151493622}
bench 18 : {'n_obs': 100, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.056304315713027284, 'time_ort': 0.001885780508423017, 'time_mlprodict': 0.003985001058835123, 'time_mlprodict2': 0.002384304780409568, 'time_mlprodict3': 0.0010702139439268245}
bench 19 : {'n_obs': 1000, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.0698889575432986, 'time_ort': 0.009377015056088567, 'time_mlprodict': 0.012755915460487207, 'time_mlprodict2': 0.021200994200383624, 'time_mlprodict3': 0.00855348485832413}
bench 20 : {'n_obs': 10000, 'nfeat': 30, 'max_depth': 12, 'n_estimators': 100, 'method': 'predict', 'n_jobs': 8, 'time_skl': 0.09663259108889509, 'time_ort': 0.07558634409426966, 'time_mlprodict': 0.09629837364297021, 'time_mlprodict2': 0.06625216780230403, 'time_mlprodict3': 0.08335775382478129}
Total time = 291.746 sec cpu=8

                            15          16         17        18        19
n_obs                        1          10        100      1000     10000
nfeat                       30          30         30        30        30
max_depth                   12          12         12        12        12
n_estimators               100         100        100       100       100
method                 predict     predict    predict   predict   predict
n_jobs                       8           8          8         8         8
time_skl              0.042986    0.044195   0.056304  0.069889  0.096633
time_ort              0.000095    0.001169   0.001886  0.009377  0.075586
time_mlprodict        0.002231    0.000253   0.003985  0.012756  0.096298
time_mlprodict2       0.000075    0.001363   0.002384  0.021201  0.066252
time_mlprodict3       0.000078    0.000293    0.00107  0.008553  0.083358
speedup_ort         452.895427   37.820615  29.857301   7.45322   1.27844
speedup_mlprodict      19.2643  174.560029  14.129059  5.478945  1.003471
speedup_mlprodict2  570.248139   32.430752  23.614563  3.296494  1.458557
speedup_mlprodict3  553.043638  150.757613  52.610336  8.170817  1.159251
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_opml_random_forest_reg.py:323: MatplotlibDeprecationWarning: The label function was deprecated in Matplotlib 3.1 and will be removed in 3.8. Use Tick.label1 instead.
  tick.label.set_fontsize(8)
somewhere/workspace/mlprodict/mlprodict_UT_39_std/_doc/examples/plot_opml_random_forest_reg.py:325: MatplotlibDeprecationWarning: The label function was deprecated in Matplotlib 3.1 and will be removed in 3.8. Use Tick.label1 instead.
  tick.label.set_fontsize(8)

Total running time of the script: ( 5 minutes 6.064 seconds)

Gallery generated by Sphinx-Gallery