Memory usage

Links: notebook, html, PDF, python, slides, GitHub

The first benchmark, based on scikit-learn’s benchmark, shows high peaks of memory usage for the python runtime on linear models. Let’s see how to measure that.

from jyquickhelper import add_notebook_menu
add_notebook_menu()

Artificial huge data

import numpy
N, nfeat = 300000, 200
N * nfeat * 8 / 1e9
0.48
X = numpy.random.random((N, nfeat))
y = numpy.empty((N, 50))
for i in range(y.shape[1]):
    y[:, i] = X.sum(axis=1) + numpy.random.random(N)
X.shape, y.shape
((300000, 200), (300000, 50))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1)
from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(X_train, y_train)
LinearRegression()
from mlprodict.onnx_conv import to_onnx
from mlprodict.onnxrt import OnnxInference
clr_onnx = to_onnx(clr, X_train[:1].astype(numpy.float32))
oinfpy = OnnxInference(clr_onnx, runtime='python')

Let’s minimize the cost of verifications on scikit-learn’s side.

from sklearn import set_config
set_config(assume_finite=True)

Profiling the prediction function

from pyquickhelper.pycode.profiling import profile
print(profile(lambda: clr.predict(X_test),
              pyinst_format='text')[1])
  _     ._   __/__   _ _  _  _ _/_   Recorded: 15:51:37  Samples:  4
 /_//_/// /_/ //_// / //_'/ //     Duration: 0.439     CPU time: 0.797
/   _/                      v3.0.1
Program: c:python372_x64libsite-packagesipykernel_launcher.py -f C:UsersxavieAppDataRoamingjupyterruntimekernel-4e37b7b5-7bfc-4784-9e5a-cae5acd320c1.json
0.439 profile  pyquickhelperpycodeprofiling.py:49
|- 0.427 <lambda>  <ipython-input-12-1097e70fe6c7>:2
|  `- 0.427 predict  sklearnlinear_model_base.py:222
|     `- 0.427 _decision_function  sklearnlinear_model_base.py:215
|        |- 0.371 inner_f  sklearnutilsvalidation.py:60
|        |  `- 0.370 safe_sparse_dot  sklearnutilsextmath.py:118
|        `- 0.056 [self]
`- 0.012 [self]
import numpy

def nastype32(mat):
    """
    Casts an array into single precision.

    :param mat: object exposing ``astype`` (a numpy array in this notebook)
    :return: result of ``mat.astype(numpy.float32)``
    """
    single = numpy.float32
    converted = mat.astype(single)
    return converted

print(profile(lambda: oinfpy.run({'X': nastype32(X_test)}),
              pyinst_format='text')[1])
  _     ._   __/__   _ _  _  _ _/_   Recorded: 15:51:39  Samples:  5
 /_//_/// /_/ //_// / //_'/ //     Duration: 0.378     CPU time: 0.453
/   _/                      v3.0.1
Program: c:python372_x64libsite-packagesipykernel_launcher.py -f C:UsersxavieAppDataRoamingjupyterruntimekernel-4e37b7b5-7bfc-4784-9e5a-cae5acd320c1.json
0.378 profile  pyquickhelperpycodeprofiling.py:49
|- 0.370 <lambda>  <ipython-input-13-da4aa05db7ed>:6
|  |- 0.233 run  mlprodictonnxrtonnx_inference.py:471
|  |  `- 0.233 _run_sequence_runtime  mlprodictonnxrtonnx_inference.py:551
|  |     `- 0.233 run  mlprodictonnxrtonnx_inference_node.py:141
|  |        `- 0.233 run  mlprodictonnxrtops_cpu_op.py:374
|  |           `- 0.233 run  mlprodictonnxrtops_cpu_op.py:289
|  |              `- 0.233 _run  mlprodictonnxrtops_cpuop_linear_regressor.py:27
|  |                 |- 0.215 numpy_dot_inplace  mlprodictonnxrtops_cpu_op_numpy_helper.py:8
|  |                 |  `- 0.215 dot  <__array_function__ internals>:2
|  |                 `- 0.018 [self]
|  |- 0.112 nastype32  <ipython-input-13-da4aa05db7ed>:3
|  `- 0.026 [self]
`- 0.008 [self]

A large part of the time is spent casting the data into float32. Let’s take the cast out of the profiled function.

X_test32 = X_test.astype(numpy.float32)

print(profile(lambda: oinfpy.run({'X': X_test32}),
              pyinst_format='text')[1])
  _     ._   __/__   _ _  _  _ _/_   Recorded: 15:51:43  Samples:  3
 /_//_/// /_/ //_// / //_'/ //     Duration: 0.081     CPU time: 0.141
/   _/                      v3.0.1
Program: c:python372_x64libsite-packagesipykernel_launcher.py -f C:UsersxavieAppDataRoamingjupyterruntimekernel-4e37b7b5-7bfc-4784-9e5a-cae5acd320c1.json
0.080 profile  pyquickhelperpycodeprofiling.py:49
|- 0.074 <lambda>  <ipython-input-14-fe055596e921>:3
|  `- 0.074 run  mlprodictonnxrtonnx_inference.py:471
|     `- 0.074 _run_sequence_runtime  mlprodictonnxrtonnx_inference.py:551
|        `- 0.074 run  mlprodictonnxrtonnx_inference_node.py:141
|           `- 0.074 run  mlprodictonnxrtops_cpu_op.py:374
|              `- 0.074 run  mlprodictonnxrtops_cpu_op.py:289
|                 `- 0.074 _run  mlprodictonnxrtops_cpuop_linear_regressor.py:27
|                    |- 0.059 numpy_dot_inplace  mlprodictonnxrtops_cpu_op_numpy_helper.py:8
|                    |  `- 0.059 dot  <__array_function__ internals>:2
|                    `- 0.015 [self]
`- 0.007 [self]

Much better.

SGDClassifier

This model is implemented with many ONNX nodes. Let’s see how it behaves.

from sklearn.linear_model import SGDClassifier
from sklearn.datasets import load_iris
data = load_iris()
Xir, yir = data.data, data.target
Xir_train, Xir_test, yir_train, yir_test = train_test_split(Xir, yir)
sgcl = SGDClassifier()
sgcl.fit(Xir_train, yir_train)
SGDClassifier()
sgd_onnx = to_onnx(sgcl, Xir_train.astype(numpy.float32))
C:xavierdupre__home_github_forkscikit-learnsklearnutilsdeprecation.py:101: FutureWarning: Attribute average_coef_ was deprecated in version 0.23 and will be removed in 0.25.
  warnings.warn(msg, category=FutureWarning)
C:xavierdupre__home_github_forkscikit-learnsklearnutilsdeprecation.py:101: FutureWarning: Attribute average_intercept_ was deprecated in version 0.23 and will be removed in 0.25.
  warnings.warn(msg, category=FutureWarning)
C:xavierdupre__home_github_forkscikit-learnsklearnutilsdeprecation.py:101: FutureWarning: Attribute standard_coef_ was deprecated in version 0.23 and will be removed in 0.25.
  warnings.warn(msg, category=FutureWarning)
C:xavierdupre__home_github_forkscikit-learnsklearnutilsdeprecation.py:101: FutureWarning: Attribute standard_intercept_ was deprecated in version 0.23 and will be removed in 0.25.
  warnings.warn(msg, category=FutureWarning)
%load_ext mlprodict
%onnxview sgd_onnx
sgd_oinf = OnnxInference(sgd_onnx)
def call_n_times_x1(n, X_test, sgd_oinf):
    """
    Calls ``sgd_oinf.run`` *n* times with the same input.

    :param n: number of calls
    :param X_test: value fed to the runtime as input ``'X'``
    :param sgd_oinf: object exposing a method ``run`` which takes
        a dictionary of inputs
    :return: result of the last call, ``None`` if *n* is not positive
    """
    # Bound before the loop so that n <= 0 returns None instead of
    # raising UnboundLocalError on the return statement.
    res = None
    for _ in range(n):
        res = sgd_oinf.run({'X': X_test})
    return res

call_n_times_x1(20, Xir_test[:1].astype(numpy.float32), sgd_oinf)
{'output_label': array([0], dtype=int64),
 'output_probability': [{0: -65.8407, 1: -158.60867, 2: -100.55802}]}
sgcl.decision_function(Xir_test[:1])
array([[ -65.840706  , -158.60864916, -100.55799704]])
xir_32 = Xir_test[:1].astype(numpy.float32)

print(profile(lambda: call_n_times_x1(20000, xir_32, sgd_oinf),
              pyinst_format='text')[1])
  _     ._   __/__   _ _  _  _ _/_   Recorded: 15:52:03  Samples:  1022
 /_//_/// /_/ //_// / //_'/ //     Duration: 1.432     CPU time: 1.453
/   _/                      v3.0.1
Program: c:python372_x64libsite-packagesipykernel_launcher.py -f C:UsersxavieAppDataRoamingjupyterruntimekernel-4e37b7b5-7bfc-4784-9e5a-cae5acd320c1.json
1.432 profile  pyquickhelperpycodeprofiling.py:49
`- 1.432 <lambda>  <ipython-input-22-ec5a6181dc40>:3
   `- 1.432 call_n_times_x1  <ipython-input-20-32f502ef162e>:1
      |- 1.412 run  mlprodictonnxrtonnx_inference.py:471
      |  |- 1.381 _run_sequence_runtime  mlprodictonnxrtonnx_inference.py:551
      |  |  |- 1.218 run  mlprodictonnxrtonnx_inference_node.py:141
      |  |  |  |- 0.398 [self]
      |  |  |  |- 0.311 run  mlprodictonnxrtops_cpu_op.py:132
      |  |  |  |  |- 0.193 _run  mlprodictonnxrtops_cpuop_array_feature_extractor.py:59
      |  |  |  |  |  |- 0.170 _array_feature_extrator  mlprodictonnxrtops_cpuop_array_feature_extractor.py:17
      |  |  |  |  |  `- 0.023 [self]
      |  |  |  |  |- 0.047 _run  mlprodictonnxrtops_cpuop_cast.py:37
      |  |  |  |  |  `- 0.033 _run_inplace  mlprodictonnxrtops_cpuop_cast.py:42
      |  |  |  |  |     `- 0.020 <lambda>  mlprodictonnxrtops_cpuop_cast.py:35
      |  |  |  |  |- 0.028 [self]
      |  |  |  |  |- 0.022 _run  mlprodictonnxrtops_cpuop_zipmap.py:221
      |  |  |  |  `- 0.021 _run  mlprodictonnxrtops_cpuop_reshape.py:16
      |  |  |  |- 0.299 run  mlprodictonnxrtops_cpu_op.py:337
      |  |  |  |  `- 0.287 run  mlprodictonnxrtops_cpu_op.py:289
      |  |  |  |     `- 0.281 _run  mlprodictonnxrtops_cpuop_argmax.py:69
      |  |  |  |        `- 0.277 _run  mlprodictonnxrtops_cpuop_argmax.py:42
      |  |  |  |           `- 0.271 _argmax  mlprodictonnxrtops_cpuop_argmax.py:12
      |  |  |  |              |- 0.159 expand_dims  <__array_function__ internals>:2
      |  |  |  |              |  `- 0.155 expand_dims  numpylibshape_base.py:512
      |  |  |  |              |        [10 frames hidden]  numpy
      |  |  |  |              |- 0.059 argmax  <__array_function__ internals>:2
      |  |  |  |              |  |- 0.041 argmax  numpycorefromnumeric.py:1112
      |  |  |  |              |  |     [4 frames hidden]  numpy
      |  |  |  |              |  `- 0.018 [self]
      |  |  |  |              `- 0.052 [self]
      |  |  |  |- 0.171 run  mlprodictonnxrtops_cpu_op.py:517
      |  |  |  |  |- 0.155 run  mlprodictonnxrtops_cpu_op.py:453
      |  |  |  |  |  |- 0.075 _run  mlprodictonnxrtops_cpu_op.py:550
      |  |  |  |  |  `- 0.067 _run  mlprodictonnxrtops_cpuop_matmul.py:16
      |  |  |  |  |     `- 0.066 numpy_dot_inplace  mlprodictonnxrtops_cpu_op_numpy_helper.py:8
      |  |  |  |  |        `- 0.055 dot  <__array_function__ internals>:2
      |  |  |  |  `- 0.016 [self]
      |  |  |  `- 0.038 <genexpr>  mlprodictonnxrtonnx_inference_node.py:153
      |  |  `- 0.158 [self]
      |  `- 0.031 [self]
      `- 0.020 [self]

The code in mlprodict/onnxrt/onnx_inference_node.py just calls an operator and updates the list containing all the results. The time spent there becomes significant when the model has many nodes and the python runtime is used.

Memory profiling

%matplotlib inline
from memory_profiler import memory_usage
memprof_skl = memory_usage((clr.predict, (X_test, )), timestamps=True, interval=0.01)
memprof_skl
[(811.3515625, 1594129928.0175571),
 (811.671875, 1594129932.2684996),
 (822.36328125, 1594129932.28645),
 (832.11328125, 1594129932.30241),
 (847.05078125, 1594129932.3183646),
 (860.5625, 1594129932.333325),
 (874.48828125, 1594129932.3482847),
 (883.73828125, 1594129932.3642418),
 (898.80078125, 1594129932.380199),
 (907.98828125, 1594129932.3961573),
 (935.03515625, 1594129932.4121134),
 (965.03515625, 1594129932.4280717),
 (998.59765625, 1594129932.4440289),
 (949.73828125, 1594129932.4599853),
 (914.75390625, 1594129932.464972)]
import matplotlib.pyplot as plt
from pandas import DataFrame, to_datetime

def mem_profile_plot(mem, title):
    """
    Plots the memory usage recorded by ``memory_usage`` against elapsed time.

    :param mem: sequence of pairs ``(memory, timestamp)`` as returned by
        ``memory_usage(..., timestamps=True)``
    :param title: text displayed on the first line of the plot title
    :return: the matplotlib axis holding the plot
    """
    _, ax = plt.subplots(1, 1, figsize=(4, 4))
    df = DataFrame(mem, columns=["memory", "timestamp"])
    # The timestamps are epoch times expressed in seconds. Without
    # unit="s", pandas interprets bare numbers as nanoseconds and the
    # gaps of a few milliseconds between two samples vanish.
    df["timestamp"] = to_datetime(df.timestamp, unit="s")
    # Switch to the time elapsed since the first sample.
    df["timestamp"] -= df.timestamp.min()
    df.set_index("timestamp").plot(ax=ax)
    ax.set_title(title + "\nmemory usage")
    return ax

mem_profile_plot(memprof_skl, "clr.predict");
../_images/onnx_profile_30_0.png
memprof_onx = memory_usage((oinfpy.run, ({'X': X_test32}, )), timestamps=True, interval=0.01)
mem_profile_plot(memprof_onx, "oinfpy.run");
../_images/onnx_profile_31_0.png
# The cast must happen inside the profiled callable: an argument placed
# in the tuple is evaluated before memory_usage starts sampling, so its
# allocation would not appear in the measure.
def oinfpy_run_with_cast():
    return oinfpy.run({'X': X_test.astype(numpy.float32, copy=False)})

memprof_onx2 = memory_usage((oinfpy_run_with_cast, ()),
                           timestamps=True, interval=0.01)
mem_profile_plot(memprof_onx2, "oinfpy.run + astype(numpy.float32)");
../_images/onnx_profile_32_0.png

This is not very informative.

Memory profiling outside the notebook

More precise.

%%writefile mprof_clr_predict.py

import numpy
N, nfeat = 300000, 200
X = numpy.random.random((N, nfeat))
y = numpy.empty((N, 50))
for i in range(y.shape[1]):
    y[:, i] = X.sum(axis=1) + numpy.random.random(N)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1)

from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(X_train, y_train)

from sklearn import set_config
set_config(assume_finite=True)

from memory_profiler import profile
@profile
def clr_predict():
    """Calls ``clr.predict`` on ``X_test`` under memory_profiler's ``profile``."""
    # The prediction is not stored: only the call itself is measured.
    clr.predict(X_test)

clr_predict()
Overwriting mprof_clr_predict.py
!python -m memory_profiler mprof_clr_predict.py --timestamp
Filename: mprof_clr_predict.py
Line #    Mem usage    Increment   Line Contents
================================================
    20   1234.7 MiB   1234.7 MiB   @profile
    21                             def clr_predict():

The notebook seems to increase the memory usage.

%%writefile mprof_onnx_run.py

import numpy
N, nfeat = 300000, 200
X = numpy.random.random((N, nfeat))
y = numpy.empty((N, 50))
for i in range(y.shape[1]):
    y[:, i] = X.sum(axis=1) + numpy.random.random(N)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1)

from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(X_train, y_train)

from mlprodict.onnx_conv import to_onnx
from mlprodict.onnxrt import OnnxInference
clr_onnx = to_onnx(clr, X_train[:1].astype(numpy.float32))
oinfpy = OnnxInference(clr_onnx, runtime='python')
X_test32 = X_test.astype(numpy.float32)

from sklearn import set_config
set_config(assume_finite=True)

from memory_profiler import profile
@profile
def oinfpy_predict():
    """Runs ``oinfpy`` on ``X_test32`` under memory_profiler's ``profile``."""
    # X_test32 was cast to float32 at module level, so the cast is not
    # part of what this function measures.
    oinfpy.run({'X': X_test32})

oinfpy_predict()
Overwriting mprof_onnx_run.py
!python -m memory_profiler mprof_onnx_run.py --timestamp
Filename: mprof_onnx_run.py
Line #    Mem usage    Increment   Line Contents
================================================
    26   1498.8 MiB   1498.8 MiB   @profile
    27                             def oinfpy_predict():
    28   1500.1 MiB      1.3 MiB       oinfpy.run({'X': X_test32})
%%writefile mprof_onnx_run32.py

import numpy
N, nfeat = 300000, 200
X = numpy.random.random((N, nfeat))
y = numpy.empty((N, 50))
for i in range(y.shape[1]):
    y[:, i] = X.sum(axis=1) + numpy.random.random(N)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1)

from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(X_train, y_train)

from mlprodict.onnx_conv import to_onnx
from mlprodict.onnxrt import OnnxInference
clr_onnx = to_onnx(clr, X_train[:1].astype(numpy.float32))
oinfpy = OnnxInference(clr_onnx, runtime='python')

from sklearn import set_config
set_config(assume_finite=True)

from memory_profiler import profile
@profile
def oinfpy_predict32():
    """Runs ``oinfpy`` on ``X_test`` cast to float32 under memory_profiler's ``profile``."""
    # The cast to float32 happens inside the profiled function, so it is
    # included in the measure.
    oinfpy.run({'X': X_test.astype(numpy.float32)})

oinfpy_predict32()
Overwriting mprof_onnx_run32.py
!python -m memory_profiler mprof_onnx_run32.py --timestamp
Filename: mprof_onnx_run32.py
Line #    Mem usage    Increment   Line Contents
================================================
    25   1293.1 MiB   1293.1 MiB   @profile
    26                             def oinfpy_predict32():
    27   1294.4 MiB      1.3 MiB       oinfpy.run({'X': X_test.astype(numpy.float32)})