.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "gyexamples/plot_bench_polynomial_features_partial_fit.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        Click :ref:`here <sphx_glr_download_gyexamples_plot_bench_polynomial_features_partial_fit.py>`
        to download the full example code

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_gyexamples_plot_bench_polynomial_features_partial_fit.py:


.. _l-bench-slk-poly:

Benchmark of PolynomialFeatures + partial_fit of SGDClassifier
==============================================================

This benchmark looks into a new implementation of
`PolynomialFeatures <https://scikit-learn.org/stable/
modules/generated/sklearn.preprocessing.PolynomialFeatures.html>`_
proposed in `PR13290 <https://github.com/
scikit-learn/scikit-learn/pull/13290>`_.
It tests the following configurations:

* **SGD-ONLY**: :epkg:`sklearn:linear_model:SGDClassifier` only
* **SGD-SKL**: :epkg:`sklearn:preprocessing:PolynomialFeatures`
  from :epkg:`scikit-learn` (whichever version is installed)
* **SGD-FAST**: new implementation copy-pasted in the
  benchmark source file
* **SGD-SLOW**: implementation of 0.20.2 copy-pasted
  in the benchmark source file

This example takes the example :ref:`l-bench-slk-poly-standalone`
and rewrites it with module :epkg:`pymlbenchmark`.

.. contents::
    :local:

.. GENERATED FROM PYTHON SOURCE LINES 29-46

.. code-block:: default

    from pymlbenchmark.plotting import plot_bench_results
    from pymlbenchmark.context import machine_information
    from time import perf_counter as time

    import matplotlib.pyplot as plt
    import pandas
    import sklearn
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import SGDClassifier
    try:
        from sklearn.utils._testing import ignore_warnings
    except ImportError:
        from sklearn.utils.testing import ignore_warnings
    from mlinsights.mlmodel import ExtendedFeatures


.. GENERATED FROM PYTHON SOURCE LINES 47-49

Implementation to benchmark
+++++++++++++++++++++++++++

.. GENERATED FROM PYTHON SOURCE LINES 49-110

.. code-block:: default


    from pymlbenchmark.benchmark import BenchPerf, BenchPerfTest
    from pymlbenchmark.datasets import random_binary_classification


    class PolyBenchPerfTest(BenchPerfTest):
        """
        Benchmark case measuring ``partial_fit`` of an SGDClassifier
        combined with several polynomial feature expansions.

        Four models are trained once in the constructor:

        * ``model1``: a bare SGDClassifier trained on features already
          expanded by ``PolynomialFeatures``,
        * ``model2``: pipeline ``PolynomialFeatures`` + SGDClassifier,
        * ``model3``: pipeline ``ExtendedFeatures(kind='poly')``
          + SGDClassifier,
        * ``model4``: pipeline ``ExtendedFeatures(kind='poly-slow')``
          + SGDClassifier.
        """

        def __init__(self, dim=None, **opts):
            # Everything which must not be measured (model creation and
            # the initial training) takes place in the constructor.
            assert dim is not None
            BenchPerfTest.__init__(self, **opts)
            self.model1 = SGDClassifier()
            self.model2 = make_pipeline(PolynomialFeatures(), SGDClassifier())
            self.model3 = make_pipeline(
                ExtendedFeatures(kind='poly'), SGDClassifier())
            self.model4 = make_pipeline(
                ExtendedFeatures(kind='poly-slow'), SGDClassifier())
            features, labels = random_binary_classification(10000, dim)
            # The bare classifier has no expansion step of its own:
            # it is trained on features expanded beforehand.
            self.model1.fit(
                PolynomialFeatures().fit_transform(features), labels)
            # The pipelines are trained in the same order as they are declared.
            for pipeline in (self.model2, self.model3, self.model4):
                pipeline.fit(features, labels)

        def data(self, N=None, dim=None):
            # A fresh random dataset is generated on every call.
            assert N is not None
            assert dim is not None
            return random_binary_classification(N, dim)

        def fcts(self, dim=None, **kwargs):
            # Returns the functions to benchmark, one dictionary per test.
            # NOTE(review): for 'SGD-ONLY' the entry is a pair
            # (preprocess, function); presumably the benchmark runs the
            # preprocessing outside of the measured part -- confirm
            # against pymlbenchmark.

            def preprocess(X, y):
                expanded = PolynomialFeatures().fit_transform(X)
                return expanded, y

            def partial_fit_model1(X, y, model=self.model1):
                return model.partial_fit(X, y)

            # The three functions below expand the features with the first
            # step of the pipeline, then update the final classifier only.

            def partial_fit_model2(X, y, model=self.model2):
                (_, expander), (_, classifier) = model.steps
                return classifier.partial_fit(expander.transform(X), y)

            def partial_fit_model3(X, y, model=self.model3):
                (_, expander), (_, classifier) = model.steps
                return classifier.partial_fit(expander.transform(X), y)

            def partial_fit_model4(X, y, model=self.model4):
                (_, expander), (_, classifier) = model.steps
                return classifier.partial_fit(expander.transform(X), y)

            return [
                dict(test='SGD-ONLY', fct=(preprocess, partial_fit_model1)),
                dict(test='SGD-SKL', fct=partial_fit_model2),
                dict(test='SGD-FAST', fct=partial_fit_model3),
                dict(test='SGD-SLOW', fct=partial_fit_model4),
            ]

        def validate(self, results, **kwargs):
            # Every result is a triple: the second item holds the test
            # options, the third one the trained model.
            for _, options, fitted in results:
                assert isinstance(options, dict)
                assert isinstance(fitted, SGDClassifier)


.. GENERATED FROM PYTHON SOURCE LINES 111-113

Benchmark function
++++++++++++++++++

.. GENERATED FROM PYTHON SOURCE LINES 113-129

.. code-block:: default


    @ignore_warnings(category=(FutureWarning, DeprecationWarning))
    def run_bench(repeat=100, verbose=False):
        """
        Runs the benchmark for every combination of *dim* and *N*
        and returns the collected measures as a DataFrame.

        :param repeat: forwarded to ``enumerate_run_benchs``
        :param verbose: forwarded to ``enumerate_run_benchs``
        :return: DataFrame built from the rows produced by the benchmark
        """
        # First dictionary: parameters given to the test constructor,
        # second one: parameters given to the data generation.
        before_params = {'dim': [5, 10, 50]}
        after_params = {'N': [10, 100, 1000]}
        bench = BenchPerf(before_params, after_params, PolyBenchPerfTest)

        # scikit-learn is configured with assume_finite=True
        # for the whole duration of the run.
        with sklearn.config_context(assume_finite=True):
            begin = time()
            rows = list(bench.enumerate_run_benchs(
                repeat=repeat, verbose=verbose))
            elapsed = time() - begin

        measures = pandas.DataFrame(rows)
        print("Total time = %0.3f sec\n" % elapsed)
        return measures


.. GENERATED FROM PYTHON SOURCE LINES 130-132

Run the benchmark
+++++++++++++++++

.. GENERATED FROM PYTHON SOURCE LINES 132-138

.. code-block:: default


    # Runs the benchmark with the verbose flag on, saves the measures
    # into a CSV file without the index, then displays the first rows.
    # ``df`` is reused by the plotting cell further down.
    df = run_bench(verbose=True)
    df.to_csv("plot_bench_polynomial_features_partial_fit.perf.csv", index=False)
    print(df.head())


.. rst-class:: sphx-glr-script-out

 .. code-block:: none


      0%|          | 0/9 [00:00<?, ?it/s]
     11%|#1        | 1/9 [00:02<00:22,  2.87s/it]
     22%|##2       | 2/9 [00:04<00:15,  2.15s/it]
     33%|###3      | 3/9 [00:08<00:17,  2.99s/it]
     44%|####4     | 4/9 [00:09<00:11,  2.39s/it]
     56%|#####5    | 5/9 [00:12<00:10,  2.52s/it]
     67%|######6   | 6/9 [01:56<01:50, 36.96s/it]
     78%|#######7  | 7/9 [02:09<00:58, 29.05s/it]
     89%|########8 | 8/9 [02:38<00:29, 29.23s/it]
     89%|########8 | 8/9 [02:38<00:19, 19.87s/it]
    Total time = 158.924 sec

           test    N  dim  repeat  ...     upper  count    median  error_c
    0  SGD-ONLY   10    5     100  ...  0.001467    100  0.001397        0
    1   SGD-SKL   10    5     100  ...  0.002015    100  0.001954        0
    2  SGD-FAST   10    5     100  ...  0.001764    100  0.001666        0
    3  SGD-SLOW   10    5     100  ...  0.003053    100  0.002994        0
    4  SGD-ONLY  100    5     100  ...  0.001584    100  0.001511        0

    [5 rows x 15 columns]


.. GENERATED FROM PYTHON SOURCE LINES 139-141

Extract information about the machine used
++++++++++++++++++++++++++++++++++++++++++

.. GENERATED FROM PYTHON SOURCE LINES 141-147

.. code-block:: default


    # Collects information about the machine and about the packages
    # listed in ``pkgs``, stores it in a CSV file next to the benchmark
    # results and displays it.
    pkgs = ['numpy', 'pandas', 'sklearn']
    dfi = pandas.DataFrame(machine_information(pkgs))
    dfi.to_csv("plot_bench_polynomial_features_partial_fit.time.csv", index=False)
    print(dfi)


.. rst-class:: sphx-glr-script-out

 .. code-block:: none

                             name  ...                                              value
    0                        date  ...                                                NaN
    1                      python  ...  3.9.1 (default, Jan 18 2021, 16:35:58) \n[GCC ...
    2                    platform  ...                                              linux
    3                          OS  ...        Linux-4.19.0-23-amd64-x86_64-with-glibc2.28
    4                     machine  ...                                             x86_64
    5                   processor  ...                                                   
    6                     release  ...                                    4.19.0-23-amd64
    7                architecture  ...                                       (64bit, ELF)
    8                        arch  ...                                             X86_64
    9                   brand_raw  ...            Intel(R) Atom(TM) CPU  C2750  @ 2.40GHz
    10                      count  ...                                                  8
    11                      flags  ...  3dnowprefetch acpi aes aperfmperf apic arat ar...
    12              hz_advertised  ...                                    [2400000000, 0]
    13         l1_data_cache_size  ...                                              24576
    14  l1_instruction_cache_size  ...                                              32768
    15     l2_cache_associativity  ...                                                  8
    16         l2_cache_line_size  ...                                               1024
    17              l2_cache_size  ...                                            1048576
    18              l3_cache_size  ...                                            1048576
    19                   stepping  ...                                                  8
    20                      numpy  ...                               openblas, language=c
    21                     pandas  ...                                                NaN
    22                    sklearn  ...                                                NaN

    [23 rows x 3 columns]


.. GENERATED FROM PYTHON SOURCE LINES 148-150

Plot the results
++++++++++++++++

.. GENERATED FROM PYTHON SOURCE LINES 150-158

.. code-block:: default


    # Displays the available columns, then plots the measures.
    # NOTE(review): presumably one row of graphs per value of ``N``,
    # ``dim`` on the x axis and one curve per value of ``test`` --
    # confirm against the documentation of plot_bench_results.
    print(df.columns)
    plot_bench_results(df, row_cols='N', col_cols=None,
                       x_value='dim', hue_cols=None,
                       cmp_col_values='test',
                       title="PolynomialFeatures + partial_fit\n"
                       "Benchmark scikit-learn PR13290")
    plt.show()


.. image-sg:: /gyexamples/images/sphx_glr_plot_bench_polynomial_features_partial_fit_001.png
   :alt: PolynomialFeatures + partial_fit Benchmark scikit-learn PR13290, -
   :srcset: /gyexamples/images/sphx_glr_plot_bench_polynomial_features_partial_fit_001.png
   :class: sphx-glr-single-img


.. rst-class:: sphx-glr-script-out

 .. code-block:: none

    Index(['test', 'N', 'dim', 'repeat', 'number', 'min', 'max', 'min3', 'max3',
           'mean', 'lower', 'upper', 'count', 'median', 'error_c'],
          dtype='object')
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:143: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      piv = ds.pivot(*y_cols)
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      lower_piv = ds.pivot(*lower_cols)
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:164: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      upper_piv = ds.pivot(*upper_cols)
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:143: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      piv = ds.pivot(*y_cols)
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      lower_piv = ds.pivot(*lower_cols)
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:164: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      upper_piv = ds.pivot(*upper_cols)
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:143: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      piv = ds.pivot(*y_cols)
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:153: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      lower_piv = ds.pivot(*lower_cols)
    somewhere/workspace/pymlbenchmark/pymlbenchmark_UT_39_std/_doc/sphinxdoc/source/pymlbenchmark/plotting/plot_bench_results.py:164: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
      upper_piv = ds.pivot(*upper_cols)


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** ( 2 minutes  46.584 seconds)


.. _sphx_glr_download_gyexamples_plot_bench_polynomial_features_partial_fit.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example


    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: plot_bench_polynomial_features_partial_fit.py <plot_bench_polynomial_features_partial_fit.py>`

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: plot_bench_polynomial_features_partial_fit.ipynb <plot_bench_polynomial_features_partial_fit.ipynb>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_