None

Faster Polynomial Features

Links: notebook, html, PDF, python, slides, GitHub

from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline

Polynomial Features

The current implementation of PolynomialFeatures (0.20.2) implements a term by term product for each pair X_i, X_j of features where i \leqslant j which is not the most efficient way to do it.

import numpy.random
X = numpy.random.random((100, 5))
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
Xpoly = poly.fit_transform(X)
poly.get_feature_names()
['1',
 'x0',
 'x1',
 'x2',
 'x3',
 'x4',
 'x0^2',
 'x0 x1',
 'x0 x2',
 'x0 x3',
 'x0 x4',
 'x1^2',
 'x1 x2',
 'x1 x3',
 'x1 x4',
 'x2^2',
 'x2 x3',
 'x2 x4',
 'x3^2',
 'x3 x4',
 'x4^2']
%timeit poly.transform(X)
114 µs ± 12.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

The class ExtendedFeatures implements a different way to compute the polynomial features as it tries to reduce the number of calls to numpy by using broacasted vector multplications.

from mlinsights.mlmodel import ExtendedFeatures
ext = ExtendedFeatures(poly_degree=2)
Xpoly = ext.fit_transform(X)
ext.get_feature_names()
['1',
 'x0',
 'x1',
 'x2',
 'x3',
 'x4',
 'x0^2',
 'x0 x1',
 'x0 x2',
 'x0 x3',
 'x0 x4',
 'x1^2',
 'x1 x2',
 'x1 x3',
 'x1 x4',
 'x2^2',
 'x2 x3',
 'x2 x4',
 'x3^2',
 'x3 x4',
 'x4^2']
%timeit ext.transform(X)
68.7 µs ± 10.6 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

Comparison with 5 features

from cpyquickhelper.numbers import measure_time
res = []
for n in [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000,
          5000, 10000, 20000, 50000, 100000, 200000]:
    X = numpy.random.random((n, 5))
    poly.fit(X)
    ext.fit(X)
    r1 = measure_time("poly.transform(X)", context=dict(X=X, poly=poly), repeat=5, number=10, div_by_number=True)
    r2 = measure_time("ext.transform(X)", context=dict(X=X, ext=ext), repeat=5, number=10, div_by_number=True)
    r3 = measure_time("poly.fit_transform(X)", context=dict(X=X, poly=poly), repeat=5, number=10, div_by_number=True)
    r4 = measure_time("ext.fit_transform(X)", context=dict(X=X, ext=ext), repeat=5, number=10, div_by_number=True)
    r1["name"] = "poly"
    r2["name"] = "ext"
    r3["name"] = "poly+fit"
    r4["name"] = "ext+fit"
    r1["size"] = n
    r2["size"] = n
    r3["size"] = n
    r4["size"] = n
    res.append(r1)
    res.append(r2)
    res.append(r3)
    res.append(r4)

import pandas
df = pandas.DataFrame(res)
df.tail()
average deviation min_exec max_exec repeat number context_size name size
63 0.037830 0.005577 0.031248 0.044832 5 10 240 ext+fit 100000
64 0.072671 0.005360 0.067559 0.082539 5 10 240 poly 200000
65 0.075712 0.018271 0.060476 0.100143 5 10 240 ext 200000
66 0.106755 0.019861 0.079880 0.139184 5 10 240 poly+fit 200000
67 0.074090 0.009142 0.063925 0.085899 5 10 240 ext+fit 200000
piv = df.pivot("size", "name", "average")
piv[:5]
name ext ext+fit poly poly+fit
size
1 0.000068 0.000402 0.000238 0.000275
2 0.000066 0.000156 0.000166 0.000213
5 0.000031 0.000427 0.000165 0.000196
10 0.000048 0.000237 0.000134 0.000306
20 0.000070 0.000188 0.000109 0.000153
ax = piv.plot(logy=True, logx=True)
ax.set_title("Polynomial Features for 5 features\ndegree=2")
ax.set_ylabel("seconds")
ax.set_xlabel("number of observations");
../_images/faster_polynomial_features_14_0.png

The gain is mostly visible for small dimensions.

Comparison with 1000 observations

In this experiment, the number of observations is fixed to 1000 but the number of features varies.

poly = PolynomialFeatures(degree=2)
ext = ExtendedFeatures(poly_degree=2)
# implementation of PolynomialFeatures in 0.20.2
extslow = ExtendedFeatures(poly_degree=2, kind="poly-slow")


res = []
for n in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 40, 50]:
    X = numpy.random.random((1000, n))
    poly.fit(X)
    ext.fit(X)
    extslow.fit(X)
    r1 = measure_time("poly.transform(X)", context=dict(X=X, poly=poly), repeat=5, number=30, div_by_number=True)
    r2 = measure_time("ext.transform(X)", context=dict(X=X, ext=ext), repeat=5, number=30, div_by_number=True)
    r3 = measure_time("extslow.transform(X)", context=dict(X=X, extslow=extslow), repeat=5, number=30, div_by_number=True)
    r1["name"] = "poly"
    r2["name"] = "ext"
    r3["name"] = "extslow"
    r1["nfeat"] = n
    r2["nfeat"] = n
    r3["nfeat"] = n
    x1 = poly.transform(X)
    x2 = ext.transform(X)
    x3 = extslow.transform(X)
    r1["numf"] = x1.shape[1]
    r2["numf"] = x2.shape[1]
    r3["numf"] = x3.shape[1]
    res.append(r1)
    res.append(r2)
    res.append(r3)

import pandas
df = pandas.DataFrame(res)
df.tail()
average deviation min_exec max_exec repeat number context_size name nfeat numf
37 0.009331 0.001603 0.008280 0.012519 5 30 240 ext 40 861
38 0.022619 0.002868 0.018793 0.026324 5 30 240 extslow 40 861
39 0.013188 0.000370 0.012828 0.013888 5 30 240 poly 50 1326
40 0.012817 0.000102 0.012700 0.012951 5 30 240 ext 50 1326
41 0.030384 0.000717 0.029955 0.031813 5 30 240 extslow 50 1326
piv = df.pivot("nfeat", "name", "average")
piv[:5]
name ext extslow poly
nfeat
1 0.000026 0.000059 0.000152
2 0.000055 0.000100 0.000113
3 0.000161 0.000381 0.000237
4 0.000148 0.000221 0.000219
5 0.000185 0.000340 0.000236
ax = piv.plot(logy=True, logx=True)
ax.set_title("Polynomial Features for 1000 observations\ndegree=2")
ax.set_ylabel("seconds")
ax.set_xlabel("number of features");
../_images/faster_polynomial_features_19_0.png

It is twice faster.

Comparison for different degrees

In this experiment, the number of observations and features is fixed, the degree increases.

res = []
for n in [2, 3, 4, 5, 6, 7, 8]:
    X = numpy.random.random((1000, 4))
    poly = PolynomialFeatures(degree=n)
    ext = ExtendedFeatures(poly_degree=n)
    poly.fit(X)
    ext.fit(X)
    r1 = measure_time("poly.transform(X)", context=dict(X=X, poly=poly), repeat=5, number=30, div_by_number=True)
    r2 = measure_time("ext.transform(X)", context=dict(X=X, ext=ext), repeat=5, number=30, div_by_number=True)
    r1["name"] = "poly"
    r2["name"] = "ext"
    r1["degree"] = n
    r2["degree"] = n
    x1 = poly.transform(X)
    x2 = ext.transform(X)
    r1["numf"] = x1.shape[1]
    r2["numf"] = x2.shape[1]
    res.append(r1)
    res.append(r2)

import pandas
df = pandas.DataFrame(res)
df.tail()
average deviation min_exec max_exec repeat number context_size name degree numf
9 0.001960 0.000067 0.001915 0.002094 5 30 240 ext 6 210
10 0.003131 0.000118 0.003009 0.003327 5 30 240 poly 7 330
11 0.003076 0.000233 0.002845 0.003393 5 30 240 ext 7 330
12 0.004299 0.000046 0.004243 0.004367 5 30 240 poly 8 495
13 0.004157 0.000035 0.004114 0.004217 5 30 240 ext 8 495
piv = df.pivot("degree", "name", "average")
piv[:5]
name ext poly
degree
2 0.000140 0.000312
3 0.000304 0.000363
4 0.000506 0.000579
5 0.000715 0.000789
6 0.001960 0.002032
ax = piv.plot(logy=True, logx=True)
ax.set_title("Polynomial Features for 1000 observations\nnumber of features is 4")
ax.set_ylabel("seconds")
ax.set_xlabel("degree");
../_images/faster_polynomial_features_24_0.png

It is worth transposing.

Same experiment with interaction_only=True

res = []
for n in [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000,
          5000, 10000, 20000, 50000, 100000, 200000]:
    poly = PolynomialFeatures(degree=2, interaction_only=True)
    ext = ExtendedFeatures(poly_degree=2, poly_interaction_only=True)
    X = numpy.random.random((n, 5))
    poly.fit(X)
    ext.fit(X)
    r1 = measure_time("poly.transform(X)", context=dict(X=X, poly=poly), repeat=2, number=30, div_by_number=True)
    r2 = measure_time("ext.transform(X)", context=dict(X=X, ext=ext), repeat=2, number=30, div_by_number=True)
    r1["name"] = "poly"
    r2["name"] = "ext"
    r1["size"] = n
    r2["size"] = n
    res.append(r1)
    res.append(r2)

import pandas
df = pandas.DataFrame(res)
df.tail()
average deviation min_exec max_exec repeat number context_size name size
29 0.010691 0.000073 0.010618 0.010764 2 30 240 ext 50000
30 0.026612 0.000794 0.025817 0.027406 2 30 240 poly 100000
31 0.025052 0.001583 0.023469 0.026635 2 30 240 ext 100000
32 0.058772 0.001345 0.057427 0.060118 2 30 240 poly 200000
33 0.054771 0.004555 0.050216 0.059327 2 30 240 ext 200000
piv = df.pivot("size", "name", "average")
piv[:5]
name ext poly
size
1 0.000042 0.000086
2 0.000034 0.000104
5 0.000068 0.000089
10 0.000032 0.000092
20 0.000040 0.000103
ax = piv.plot(logy=True, logx=True)
ax.set_title("Polynomial Features for 5 features\ndegree is 2 + interaction_only=True")
ax.set_ylabel("seconds")
ax.set_xlabel("N obs");
../_images/faster_polynomial_features_29_0.png

Memory profiler

from memory_profiler import memory_usage
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly.fit(X)
memory_usage((poly.transform, (X,)), interval=0.1, max_usage=True)
258.02734375
def pick_value(v):
    try:
        return v[0]
    except TypeError:
        return v

res = []
for n in [10000, 50000, 100000, 200000]:
    X = numpy.random.random((n, 50))
    print(n)
    poly = PolynomialFeatures(degree=2, interaction_only=True)
    ext = ExtendedFeatures(poly_degree=2, poly_interaction_only=True)
    poly.fit(X)
    ext.fit(X)
    r1 = memory_usage((poly.transform, (X,)), interval=0.1, max_usage=True)
    r2 = memory_usage((ext.transform, (X,)), interval=0.1, max_usage=True)
    r1 = {"memory": pick_value(r1)}
    r2 = {"memory": pick_value(r2)}
    r1["name"] = "poly"
    r2["name"] = "ext"
    r1["size"] = n
    r2["size"] = n
    res.append(r1)
    res.append(r2)

import pandas
df = pandas.DataFrame(res)
df.tail()
10000
50000
100000
200000
memory name size
3 699.679688 ext 50000
4 1243.664062 poly 100000
5 1205.515625 ext 100000
6 1952.316406 poly 200000
7 2029.765625 ext 200000
piv = df.pivot("size", "name", "memory")
piv[:5]
name ext poly
size
10000 392.445312 396.347656
50000 699.679688 718.839844
100000 1205.515625 1243.664062
200000 2029.765625 1952.316406
ax = piv.plot(logy=True, logx=True)
ax.set_title("Polynomial Features for 50 features\ndegree is 2 - memory")
ax.set_ylabel("Mb")
ax.set_xlabel("N obs");
../_images/faster_polynomial_features_34_0.png