Note
Click here to download the full example code
Benchmark of PolynomialFeatures + partial_fit of SGDClassifier¶
This benchmark looks into a new implementation of PolynomialFeatures proposed in PR13290. It tests the following configurations:
SGD-ONLY: sklearn.linear_model.SGDClassifier only
SGD-SKL: sklearn.preprocessing.PolynomialFeatures from the installed scikit-learn (whichever version that is)
SGD-FAST: new implementation copy-pasted in the benchmark source file
SGD-SLOW: implementation of 0.20.2 copy-pasted in the benchmark source file
This example takes the standalone example "Benchmark of PolynomialFeatures + partial_fit of SGDClassifier" and rewrites it with the module pymlbenchmark.
from pymlbenchmark.plotting import plot_bench_results
from pymlbenchmark.context import machine_information
from time import perf_counter as time
import matplotlib.pyplot as plt
import pandas
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier
try:
from sklearn.utils._testing import ignore_warnings
except ImportError:
from sklearn.utils.testing import ignore_warnings
from mlinsights.mlmodel import ExtendedFeatures
Implementation to benchmark¶
from pymlbenchmark.benchmark import BenchPerf, BenchPerfTest
from pymlbenchmark.datasets import random_binary_classification
class PolyBenchPerfTest(BenchPerfTest):
    """Benchmark case timing ``partial_fit`` on polynomial features.

    Four models are prepared for a given input dimension ``dim``:

    * ``model1``: a bare ``SGDClassifier`` fitted on features already
      expanded with ``PolynomialFeatures``;
    * ``model2``: pipeline ``PolynomialFeatures`` + ``SGDClassifier``;
    * ``model3``: pipeline ``ExtendedFeatures(kind='poly')`` +
      ``SGDClassifier``;
    * ``model4``: pipeline ``ExtendedFeatures(kind='poly-slow')`` +
      ``SGDClassifier``.
    """

    def __init__(self, dim=None, **opts):
        """Build and fit the four models on a random dataset.

        Everything that must *not* be measured (model creation and the
        initial fit) takes place here.

        :param dim: number of input features, required
        :param opts: forwarded to ``BenchPerfTest.__init__``
        """
        assert dim is not None
        BenchPerfTest.__init__(self, **opts)

        self.model1 = SGDClassifier()
        self.model2 = make_pipeline(PolynomialFeatures(), SGDClassifier())
        self.model3 = make_pipeline(
            ExtendedFeatures(kind='poly'), SGDClassifier())
        self.model4 = make_pipeline(
            ExtendedFeatures(kind='poly-slow'), SGDClassifier())

        X, y = random_binary_classification(10000, dim)

        # The bare classifier has no transformer of its own, so it is
        # trained on features expanded beforehand.
        expanded = PolynomialFeatures().fit_transform(X)
        self.model1.fit(expanded, y)

        # The pipelines expand the raw features themselves.
        for pipeline in (self.model2, self.model3, self.model4):
            pipeline.fit(X, y)

    def data(self, N=None, dim=None):
        """Return a fresh random binary classification dataset.

        The benchmark requires a new dataset each time.

        :param N: number of observations, required
        :param dim: number of features, required
        :return: result of ``random_binary_classification(N, dim)``
        """
        assert N is not None
        assert dim is not None
        return random_binary_classification(N, dim)

    def fcts(self, dim=None, **kwargs):
        """Return the list of functions to benchmark.

        Each entry is a dictionary with a label under ``'test'`` and the
        callable(s) under ``'fct'``.

        :param dim: unused here
        :param kwargs: unused here
        :return: list of dictionaries, one per tested configuration
        """

        def preprocess(X, y):
            # Expands the features outside of *partial_fit_model1*.
            # NOTE(review): presumably the framework runs the first
            # element of the tuple as an untimed step - confirm against
            # pymlbenchmark.
            return PolynomialFeatures().fit_transform(X), y

        def partial_fit_model1(X, y, model=self.model1):
            return model.partial_fit(X, y)

        def pipeline_partial_fit(pipeline):
            # Builds the function which transforms the features with
            # the first step of *pipeline* and calls ``partial_fit`` on
            # its second step.
            def run(X, y):
                features = pipeline.steps[0][1].transform(X)
                return pipeline.steps[1][1].partial_fit(features, y)
            return run

        tests = [
            {'test': 'SGD-ONLY', 'fct': (preprocess, partial_fit_model1)},
        ]
        labelled_pipelines = (
            ('SGD-SKL', self.model2),
            ('SGD-FAST', self.model3),
            ('SGD-SLOW', self.model4),
        )
        for label, pipeline in labelled_pipelines:
            tests.append(
                {'test': label, 'fct': pipeline_partial_fit(pipeline)})
        return tests

    def validate(self, results, **kwargs):
        """Check the type of what every tested function produced.

        :param results: iterable of triples; the second item must be a
            dictionary (test options) and the third one an
            ``SGDClassifier`` (trained model)
        :param kwargs: unused here
        """
        for _, options, fitted in results:
            assert isinstance(options, dict)  # test options
            assert isinstance(fitted, SGDClassifier)  # trained model
Benchmark function¶
@ignore_warnings(category=(FutureWarning, DeprecationWarning))
def run_bench(repeat=100, verbose=False):
    """Run the whole benchmark and return the measures as a DataFrame.

    The benchmark is built from ``PolyBenchPerfTest`` with
    ``dim`` in ``[5, 10, 50]`` and ``N`` in ``[10, 100, 1000]``.
    It runs inside ``sklearn.config_context(assume_finite=True)``
    and the total duration is printed on standard output.

    :param repeat: forwarded to ``BenchPerf.enumerate_run_benchs``
    :param verbose: forwarded to ``BenchPerf.enumerate_run_benchs``
    :return: ``pandas.DataFrame`` built from the enumerated results
    """
    params_before = {'dim': [5, 10, 50]}
    params_after = {'N': [10, 100, 1000]}
    bench = BenchPerf(params_before, params_after, PolyBenchPerfTest)

    with sklearn.config_context(assume_finite=True):
        begin = time()
        rows = list(bench.enumerate_run_benchs(
            repeat=repeat, verbose=verbose))
        elapsed = time() - begin

    results_df = pandas.DataFrame(rows)
    print("Total time = %0.3f sec\n" % elapsed)
    return results_df
Run the benchmark¶
# Run the benchmark with verbose=True, save the measures as CSV
# (without the index column) and display the first rows.
df = run_bench(verbose=True)
df.to_csv("plot_bench_polynomial_features_partial_fit.perf.csv", index=False)
print(df.head())
0%| | 0/9 [00:00<?, ?it/s]
11%|#1 | 1/9 [00:02<00:22, 2.87s/it]
22%|##2 | 2/9 [00:04<00:15, 2.15s/it]
33%|###3 | 3/9 [00:08<00:17, 2.99s/it]
44%|####4 | 4/9 [00:09<00:11, 2.39s/it]
56%|#####5 | 5/9 [00:12<00:10, 2.52s/it]
67%|######6 | 6/9 [01:56<01:50, 36.96s/it]
78%|#######7 | 7/9 [02:09<00:58, 29.05s/it]
89%|########8 | 8/9 [02:38<00:29, 29.23s/it]
89%|########8 | 8/9 [02:38<00:19, 19.87s/it]
Total time = 158.924 sec
test N dim repeat ... upper count median error_c
0 SGD-ONLY 10 5 100 ... 0.001467 100 0.001397 0
1 SGD-SKL 10 5 100 ... 0.002015 100 0.001954 0
2 SGD-FAST 10 5 100 ... 0.001764 100 0.001666 0
3 SGD-SLOW 10 5 100 ... 0.003053 100 0.002994 0
4 SGD-ONLY 100 5 100 ... 0.001584 100 0.001511 0
[5 rows x 15 columns]
Extract information about the machine used¶
# Packages passed to machine_information along with the machine details.
pkgs = ['numpy', 'pandas', 'sklearn']
# Store what machine_information returns into a DataFrame and save it.
# NOTE(review): the file is named "*.time.csv" although it holds machine
# information - confirm the name is intended.
dfi = pandas.DataFrame(machine_information(pkgs))
dfi.to_csv("plot_bench_polynomial_features_partial_fit.time.csv", index=False)
print(dfi)
name ... value
0 date ... NaN
1 python ... 3.9.1 (default, Jan 18 2021, 16:35:58) \n[GCC ...
2 platform ... linux
3 OS ... Linux-4.19.0-23-amd64-x86_64-with-glibc2.28
4 machine ... x86_64
5 processor ...
6 release ... 4.19.0-23-amd64
7 architecture ... (64bit, ELF)
8 arch ... X86_64
9 brand_raw ... Intel(R) Atom(TM) CPU C2750 @ 2.40GHz
10 count ... 8
11 flags ... 3dnowprefetch acpi aes aperfmperf apic arat ar...
12 hz_advertised ... [2400000000, 0]
13 l1_data_cache_size ... 24576
14 l1_instruction_cache_size ... 32768
15 l2_cache_associativity ... 8
16 l2_cache_line_size ... 1024
17 l2_cache_size ... 1048576
18 l3_cache_size ... 1048576
19 stepping ... 8
20 numpy ... openblas, language=c
21 pandas ... NaN
22 sklearn ... NaN
[23 rows x 3 columns]
Plot the results¶
# Show the available columns, then plot the measures: 'N' selects the
# row of the plot grid, 'dim' is the x value and the curves compare the
# values of column 'test'.
print(df.columns)
plot_bench_results(df, row_cols='N', col_cols=None,
x_value='dim', hue_cols=None,
cmp_col_values='test',
title="PolynomialFeatures + partial_fit\n"
"Benchmark scikit-learn PR13290")
plt.show()

Index(['test', 'N', 'dim', 'repeat', 'number', 'min', 'max', 'min3', 'max3',
'mean', 'lower', 'upper', 'count', 'median', 'error_c'],
dtype='object')
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:143: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
piv = ds.pivot(*y_cols)
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
lower_piv = ds.pivot(*lower_cols)
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:164: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
upper_piv = ds.pivot(*upper_cols)
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:143: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
piv = ds.pivot(*y_cols)
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
lower_piv = ds.pivot(*lower_cols)
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:164: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
upper_piv = ds.pivot(*upper_cols)
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:143: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
piv = ds.pivot(*y_cols)
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
lower_piv = ds.pivot(*lower_cols)
somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:164: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
upper_piv = ds.pivot(*upper_cols)
Total running time of the script: ( 2 minutes 46.584 seconds)