Note
Click here to download the full example code
Benchmark of PolynomialFeatures + partialfit of SGDClassifier (standalone)¶
This benchmark looks into a new implementation of PolynomialFeatures proposed in PR13290. It tests the following configurations:
SGD: sklearn.linear_model.SGDClassifier only
SGD-SKL: sklearn.preprocessing.PolynomialFeatures from scikit-learn (whichever version is installed)
SGD-FAST: new implementation copy-pasted in the benchmark source file
SGD-SLOW: implementation of 0.20.2 copy-pasted in the benchmark source file
This script is standalone and does not require pymlbenchmark, as opposed to Benchmark of PolynomialFeatures + partialfit of SGDClassifier, which reuses functions implemented in pymlbenchmark.
from time import perf_counter as time
import numpy
import numpy as np
from numpy.random import rand
import matplotlib.pyplot as plt
import pandas
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier
try:
from sklearn.utils._testing import ignore_warnings
except ImportError:
from sklearn.utils.testing import ignore_warnings
from mlinsights.mlmodel import ExtendedFeatures
Implementations to benchmark¶
def fcts_model(X, y):
    """Fit the four benchmarked models and return their ``partial_fit`` callers.

    One bare ``SGDClassifier`` is fitted on the polynomial expansion of *X*;
    three pipelines (``PolynomialFeatures``, ``ExtendedFeatures(kind='poly')``
    and ``ExtendedFeatures(kind='poly-slow')``, each followed by an
    ``SGDClassifier``) are fitted on the raw *X*.

    Parameters
    ----------
    X : training features passed to the ``fit`` calls
    y : training targets passed to the ``fit`` calls

    Returns
    -------
    tuple of four functions ``f(X, y, model=...)``. The first calls
    ``partial_fit`` on the bare classifier (it expects already expanded
    features); the other three transform ``X`` with the first, already
    fitted, pipeline step and call ``partial_fit`` on the second step.
    """
    sgd_only = SGDClassifier()
    pipe_skl = make_pipeline(PolynomialFeatures(), SGDClassifier())
    pipe_fast = make_pipeline(ExtendedFeatures(kind='poly'), SGDClassifier())
    pipe_slow = make_pipeline(
        ExtendedFeatures(kind='poly-slow'), SGDClassifier())

    # The bare classifier never sees raw features: expand them up front.
    expanded = PolynomialFeatures().fit_transform(X)
    sgd_only.fit(expanded, y)
    # Same fitting order as the pipelines were created in.
    for pipeline in (pipe_skl, pipe_fast, pipe_slow):
        pipeline.fit(X, y)

    # The models are bound as default arguments so that each returned
    # function keeps its own fitted estimator.
    def partial_fit_model1(X, y, model=sgd_only):
        return model.partial_fit(X, y)

    def partial_fit_model2(X, y, model=pipe_skl):
        (_, transformer), (_, classifier) = model.steps
        return classifier.partial_fit(transformer.transform(X), y)

    def partial_fit_model3(X, y, model=pipe_fast):
        (_, transformer), (_, classifier) = model.steps
        return classifier.partial_fit(transformer.transform(X), y)

    def partial_fit_model4(X, y, model=pipe_slow):
        (_, transformer), (_, classifier) = model.steps
        return classifier.partial_fit(transformer.transform(X), y)

    return (partial_fit_model1, partial_fit_model2,
            partial_fit_model3, partial_fit_model4)
Benchmarks¶
def build_x_y(ntrain, nfeat):
    """Draw a random binary classification problem.

    Parameters
    ----------
    ntrain : number of rows to generate
    nfeat : number of feature columns to generate

    Returns
    -------
    ``(X, y)`` where ``X`` is an ``(ntrain, nfeat)`` array drawn with
    ``numpy.random.rand`` and ``y`` is an integer array of length
    ``ntrain`` holding 1 where the row sum plus a noise term drawn from
    ``rand(ntrain) - 0.5`` is at least the row sum itself, else 0.
    """
    features = rand(ntrain, nfeat)
    row_totals = features.sum(axis=1)
    noise = rand(ntrain) - 0.5
    labels = ((row_totals + noise) >= row_totals).astype(int)
    return features, labels
def _mean_call_time(fct, batches):
    """Return the average wall-clock time of one ``fct(X, y)`` call."""
    begin = time()
    for X, y in batches:
        fct(X, y)
    return (time() - begin) / len(batches)


@ignore_warnings(category=(FutureWarning, DeprecationWarning))
def bench(n_obs, n_features, repeat=1000, verbose=False):
    """Time ``partial_fit`` of the four models over a grid of problem sizes.

    For every pair ``(n, nfeat)`` the models returned by ``fcts_model`` are
    fitted on a 1000-row training set, then each one is called on *repeat*
    freshly generated batches of ``n`` rows and the mean time per call is
    recorded.

    Parameters
    ----------
    n_obs : iterable of batch sizes (number of rows per ``partial_fit`` call)
    n_features : iterable of feature counts
    repeat : number of batches timed per configuration
    verbose : if true, print each observation as soon as it is measured

    Returns
    -------
    list of dict, one per ``(n, nfeat)`` configuration, with keys
    ``n_obs``, ``nfeat``, ``time_sgd``, ``time_pipe_skl``,
    ``time_pipe_fast`` and ``time_pipe_slow`` (times in seconds per call).

    Raises
    ------
    ZeroDivisionError
        If *repeat* is 0, because the mean is taken over zero batches.
    """
    res = []
    for n in n_obs:
        for nfeat in n_features:
            X_train, y_train = build_x_y(1000, nfeat)
            fct1, fct2, fct3, fct4 = fcts_model(X_train, y_train)

            # Generate distinct inputs for every repetition so that no
            # call can benefit from any kind of caching.
            Xs = []
            Xpolys = []
            for _ in range(repeat):
                X, y = build_x_y(n, nfeat)
                Xs.append((X, y))
                # fct1 is a bare classifier: its inputs are expanded
                # outside of the timed section.
                Xpolys.append((PolynomialFeatures().fit_transform(X), y))

            obs = dict(n_obs=n, nfeat=nfeat)
            obs["time_sgd"] = _mean_call_time(fct1, Xpolys)
            obs["time_pipe_skl"] = _mean_call_time(fct2, Xs)
            obs["time_pipe_fast"] = _mean_call_time(fct3, Xs)
            obs["time_pipe_slow"] = _mean_call_time(fct4, Xs)

            # One row per configuration: appending the same dict after
            # each measurement only produced duplicated rows.
            res.append(obs)

            if verbose:
                print("bench", len(res), ":", obs)
    return res
Plots¶
def plot_results(df, verbose=False):
    """Plot the timings of the four models, one row of axes per ``nfeat``.

    Parameters
    ----------
    df : DataFrame with columns ``n_obs``, ``nfeat``, ``time_sgd``,
        ``time_pipe_skl``, ``time_pipe_fast`` and ``time_pipe_slow``
    verbose : if true, print the rows plotted on each axis

    The curves are drawn on log-log axes in the first column of a grid of
    subplots created with ``plt.subplots``; nothing is returned.
    """
    nfeat_values = sorted(set(df.nfeat))
    # The axes are indexed in 2D below, so the grid needs at least two
    # rows and two columns to keep ``plt.subplots`` returning a 2D array.
    nrows = max(len(nfeat_values), 2)
    ncols = 2
    # figsize is (width, height): width follows the number of columns.
    _, ax = plt.subplots(nrows, ncols,
                         figsize=(ncols * 4, nrows * 4))

    # (column, legend label, color, extra plot options) for each curve.
    curves = (
        ("time_sgd", "SGD", "g", {"style": '--'}),
        ("time_pipe_skl", "SGD-SKL", "b", {"style": '--'}),
        ("time_pipe_fast", "SGD-FAST", "r", {}),
        ("time_pipe_slow", "SGD-SLOW", "y", {}),
    )

    for row, nfeat in enumerate(nfeat_values):
        a = ax[row, 0]
        if row == ax.shape[0] - 1:
            a.set_xlabel("N observations", fontsize='x-small')
        a.set_ylabel("Time (s) nfeat={}".format(nfeat),
                     fontsize='x-small')

        subset = df[df.nfeat == nfeat]
        if subset.shape[0] == 0:
            continue
        subset = subset.sort_values("n_obs")
        if verbose:
            print(subset)

        for column, label, color, options in curves:
            subset.plot(x="n_obs", y=column, label=label, ax=a,
                        logx=True, logy=True, c=color, **options)

        a.legend(loc=0, fontsize='x-small')
        if row == 0:
            a.set_title("--", fontsize='x-small')

    plt.suptitle("Benchmark for Polynomial with SGDClassifier", fontsize=16)
Final function for the benchmark¶
def run_bench(repeat=100, verbose=False):
    """Run the whole benchmark, plot it and return the measurements.

    Parameters
    ----------
    repeat : number of timed batches per configuration, forwarded to
        ``bench``
    verbose : forwarded to ``bench`` and ``plot_results``

    Returns
    -------
    pandas.DataFrame built from the list returned by ``bench``.
    """
    batch_sizes = [10, 100, 1000]
    feature_counts = [5, 10, 50]

    # Timings are taken with scikit-learn's assume_finite=True setting.
    with sklearn.config_context(assume_finite=True):
        begin = time()
        measures = bench(batch_sizes, feature_counts,
                         repeat=repeat, verbose=verbose)
        elapsed = time() - begin

    frame = pandas.DataFrame(measures)
    print("Total time = %0.3f sec\n" % elapsed)

    # plot the results
    plot_results(frame, verbose=verbose)
    return frame
Run the benchmark¶
# Report the library versions the timings below were measured with.
print("numpy:", numpy.__version__)
print("scikit-learn:", sklearn.__version__)
# Run the benchmark with the default number of repetitions, printing each
# observation and each plotted subset along the way.
df = run_bench(verbose=True)
print(df)
# Display the figure created while running the benchmark.
plt.show()

numpy: 1.23.5
scikit-learn: 1.2.1
bench 4 : {'n_obs': 10, 'nfeat': 5, 'time_sgd': 0.0008732647501165047, 'time_pipe_skl': 0.0012723311502486467, 'time_pipe_fast': 0.0011710939899785445, 'time_pipe_slow': 0.0019378169300034642}
bench 8 : {'n_obs': 10, 'nfeat': 10, 'time_sgd': 0.0008837558398954571, 'time_pipe_skl': 0.0014124184200773016, 'time_pipe_fast': 0.001251557560171932, 'time_pipe_slow': 0.003927701040520332}
bench 12 : {'n_obs': 10, 'nfeat': 50, 'time_sgd': 0.0010817546100588516, 'time_pipe_skl': 0.0028044814500026403, 'time_pipe_fast': 0.0025753787002759055, 'time_pipe_slow': 0.05836640598019585}
bench 16 : {'n_obs': 100, 'nfeat': 5, 'time_sgd': 0.0009438706201035529, 'time_pipe_skl': 0.0014009417401393874, 'time_pipe_fast': 0.0012248492403887212, 'time_pipe_slow': 0.0020254769397433847}
bench 20 : {'n_obs': 100, 'nfeat': 10, 'time_sgd': 0.0010062795900739729, 'time_pipe_skl': 0.0016947716200957075, 'time_pipe_fast': 0.0014989030302967876, 'time_pipe_slow': 0.004109605069970712}
bench 24 : {'n_obs': 100, 'nfeat': 50, 'time_sgd': 0.0020907312695635483, 'time_pipe_skl': 0.00506440918019507, 'time_pipe_fast': 0.004822482659947127, 'time_pipe_slow': 0.06490063289005775}
bench 28 : {'n_obs': 1000, 'nfeat': 5, 'time_sgd': 0.0016234472498763352, 'time_pipe_skl': 0.0024917460599681363, 'time_pipe_fast': 0.002250096729840152, 'time_pipe_slow': 0.002969293550122529}
bench 32 : {'n_obs': 1000, 'nfeat': 10, 'time_sgd': 0.002143393229926005, 'time_pipe_skl': 0.0037833680096082387, 'time_pipe_fast': 0.003548604310490191, 'time_pipe_slow': 0.006211826209910214}
bench 36 : {'n_obs': 1000, 'nfeat': 50, 'time_sgd': 0.010771384370164014, 'time_pipe_skl': 0.03161860232008621, 'time_pipe_fast': 0.03528651961009018, 'time_pipe_slow': 0.09823671941005159}
Total time = 48.343 sec
n_obs nfeat time_sgd time_pipe_skl time_pipe_fast time_pipe_slow
0 10 5 0.000873 0.001272 0.001171 0.001938
1 10 5 0.000873 0.001272 0.001171 0.001938
2 10 5 0.000873 0.001272 0.001171 0.001938
3 10 5 0.000873 0.001272 0.001171 0.001938
12 100 5 0.000944 0.001401 0.001225 0.002025
13 100 5 0.000944 0.001401 0.001225 0.002025
14 100 5 0.000944 0.001401 0.001225 0.002025
15 100 5 0.000944 0.001401 0.001225 0.002025
24 1000 5 0.001623 0.002492 0.002250 0.002969
25 1000 5 0.001623 0.002492 0.002250 0.002969
26 1000 5 0.001623 0.002492 0.002250 0.002969
27 1000 5 0.001623 0.002492 0.002250 0.002969
n_obs nfeat time_sgd time_pipe_skl time_pipe_fast time_pipe_slow
4 10 10 0.000884 0.001412 0.001252 0.003928
5 10 10 0.000884 0.001412 0.001252 0.003928
6 10 10 0.000884 0.001412 0.001252 0.003928
7 10 10 0.000884 0.001412 0.001252 0.003928
16 100 10 0.001006 0.001695 0.001499 0.004110
17 100 10 0.001006 0.001695 0.001499 0.004110
18 100 10 0.001006 0.001695 0.001499 0.004110
19 100 10 0.001006 0.001695 0.001499 0.004110
28 1000 10 0.002143 0.003783 0.003549 0.006212
29 1000 10 0.002143 0.003783 0.003549 0.006212
30 1000 10 0.002143 0.003783 0.003549 0.006212
31 1000 10 0.002143 0.003783 0.003549 0.006212
n_obs nfeat time_sgd time_pipe_skl time_pipe_fast time_pipe_slow
8 10 50 0.001082 0.002804 0.002575 0.058366
9 10 50 0.001082 0.002804 0.002575 0.058366
10 10 50 0.001082 0.002804 0.002575 0.058366
11 10 50 0.001082 0.002804 0.002575 0.058366
20 100 50 0.002091 0.005064 0.004822 0.064901
21 100 50 0.002091 0.005064 0.004822 0.064901
22 100 50 0.002091 0.005064 0.004822 0.064901
23 100 50 0.002091 0.005064 0.004822 0.064901
32 1000 50 0.010771 0.031619 0.035287 0.098237
33 1000 50 0.010771 0.031619 0.035287 0.098237
34 1000 50 0.010771 0.031619 0.035287 0.098237
35 1000 50 0.010771 0.031619 0.035287 0.098237
n_obs nfeat time_sgd time_pipe_skl time_pipe_fast time_pipe_slow
0 10 5 0.000873 0.001272 0.001171 0.001938
1 10 5 0.000873 0.001272 0.001171 0.001938
2 10 5 0.000873 0.001272 0.001171 0.001938
3 10 5 0.000873 0.001272 0.001171 0.001938
4 10 10 0.000884 0.001412 0.001252 0.003928
5 10 10 0.000884 0.001412 0.001252 0.003928
6 10 10 0.000884 0.001412 0.001252 0.003928
7 10 10 0.000884 0.001412 0.001252 0.003928
8 10 50 0.001082 0.002804 0.002575 0.058366
9 10 50 0.001082 0.002804 0.002575 0.058366
10 10 50 0.001082 0.002804 0.002575 0.058366
11 10 50 0.001082 0.002804 0.002575 0.058366
12 100 5 0.000944 0.001401 0.001225 0.002025
13 100 5 0.000944 0.001401 0.001225 0.002025
14 100 5 0.000944 0.001401 0.001225 0.002025
15 100 5 0.000944 0.001401 0.001225 0.002025
16 100 10 0.001006 0.001695 0.001499 0.004110
17 100 10 0.001006 0.001695 0.001499 0.004110
18 100 10 0.001006 0.001695 0.001499 0.004110
19 100 10 0.001006 0.001695 0.001499 0.004110
20 100 50 0.002091 0.005064 0.004822 0.064901
21 100 50 0.002091 0.005064 0.004822 0.064901
22 100 50 0.002091 0.005064 0.004822 0.064901
23 100 50 0.002091 0.005064 0.004822 0.064901
24 1000 5 0.001623 0.002492 0.002250 0.002969
25 1000 5 0.001623 0.002492 0.002250 0.002969
26 1000 5 0.001623 0.002492 0.002250 0.002969
27 1000 5 0.001623 0.002492 0.002250 0.002969
28 1000 10 0.002143 0.003783 0.003549 0.006212
29 1000 10 0.002143 0.003783 0.003549 0.006212
30 1000 10 0.002143 0.003783 0.003549 0.006212
31 1000 10 0.002143 0.003783 0.003549 0.006212
32 1000 50 0.010771 0.031619 0.035287 0.098237
33 1000 50 0.010771 0.031619 0.035287 0.098237
34 1000 50 0.010771 0.031619 0.035287 0.098237
35 1000 50 0.010771 0.031619 0.035287 0.098237
Total running time of the script: ( 0 minutes 52.315 seconds)