.. _onnxfftsrst:

=========
ONNX FFTs
=========


.. only:: html

    **Links:** :download:`notebook <onnx_ffts.ipynb>`, :downloadlink:`html <onnx_ffts2html.html>`, :download:`PDF <onnx_ffts.pdf>`, :download:`python <onnx_ffts.py>`, :downloadlink:`slides <onnx_ffts.slides.html>`, :githublink:`GitHub|_doc/notebooks/onnx_ffts.ipynb|*`


Implementation of a couple of variations of FFT (see
`FFT <https://www.tensorflow.org/xla/operation_semantics#fft>`__ in
ONNX.

.. code:: ipython3

    from jyquickhelper import add_notebook_menu
    add_notebook_menu()


.. contents::
    :local:


.. code:: ipython3

    %matplotlib inline

.. code:: ipython3

    %load_ext mlprodict

Signature
---------

We try to use function
`FFT <https://www.tensorflow.org/xla/operation_semantics#fft>`__ or
`torch.fft.fftn <https://pytorch.org/docs/stable/generated/torch.fft.fftn.html#torch.fft.fftn>`__.

.. code:: ipython3

    import numpy
    from numpy.testing import assert_almost_equal
    
    def numpy_fftn(x, fft_type, fft_length, axes):
        """
        Implements FFT
    
        :param x: input
        :param fft_type: string (see below)
        :param fft_length: length on each axis of axes
        :param axes: axes
        :return: result
        
        * `'FFT`': complex-to-complex FFT. Shape is unchanged.
        * `'IFFT`': Inverse complex-to-complex FFT. Shape is unchanged.
        * `'RFFT`': Forward real-to-complex FFT.
          Shape of the innermost axis is reduced to fft_length[-1] // 2 + 1 if fft_length[-1]
          is a non-zero value, omitting the reversed conjugate part of 
          the transformed signal beyond the Nyquist frequency.
        * `'IRFFT`': Inverse real-to-complex FFT (ie takes complex, returns real).
          Shape of the innermost axis is expanded to fft_length[-1] if fft_length[-1] 
          is a non-zero value, inferring the part of the transformed signal beyond the Nyquist
          frequency from the reverse conjugate of the 1 to fft_length[-1] // 2 + 1 entries.
        """
        if fft_type == 'FFT':
            return numpy.fft.fftn(x, fft_length, axes=axes)
        raise NotImplementedError("Not implemented for fft_type=%r." % fft_type)
        
    
    def test_fct(fct1, fct2, fft_type='FFT', decimal=5):
        cases = list(range(4, 20))
        dims = [[c] for c in cases] + [[4,4,4,4], [4,5,6,7]]
        lengths_axes = [([c], [0]) for c in cases] + [
            ([2, 2, 2, 2], None), ([2, 6, 7, 2], None), ([2, 3, 4, 5], None),
            ([2], [3]), ([3], [2])]
        n_test = 0
        for ndim in range(1, 5):
            for dim in dims:
                for length, axes in lengths_axes:
                    if axes is None:
                        axes = range(ndim)
                    di = dim[:ndim]
                    axes = [min(len(di) - 1, a) for a in axes]
                    le = length[:ndim]
                    if len(length) > len(di):
                        continue
                    mat = numpy.random.randn(*di).astype(numpy.float32)
                    try:
                        v1 = fct1(mat, fft_type, le, axes=axes)
                    except Exception as e:
                        raise AssertionError(
                            "Unable to run %r mat.shape=%r ndim=%r di=%r fft_type=%r le=%r "
                            "axes=%r exc=%r" %(
                                fct1, mat.shape, ndim, di, fft_type, le, axes, e))
                    v2 = fct2(mat, fft_type, le, axes=axes)
                    try:
                        assert_almost_equal(v1, v2, decimal=decimal)
                    except AssertionError as e:
                        raise AssertionError(
                            "Failure mat.shape=%r, fft_type=%r, fft_length=%r" % (
                                mat.shape, fft_type, le)) from e
                    n_test += 1
        return n_test
    
    
    test_fct(numpy_fftn, numpy_fftn)


.. parsed-literal::
    1302


.. code:: ipython3

    %timeit -n 1 -r 1 test_fct(numpy_fftn, numpy_fftn)


.. parsed-literal::
    1.81 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


.. code:: ipython3

    import torch
    
    def torch_fftn(x, fft_type, fft_length, axes):
        xt = torch.tensor(x)
        if fft_type == 'FFT':
            return torch.fft.fftn(xt, fft_length, axes).cpu().detach().numpy()
        
    %timeit -n 1 -r 1 test_fct(numpy_fftn, torch_fftn)


.. parsed-literal::
    2.07 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Numpy implementation
--------------------

.. code:: ipython3

    import numpy
    
    
    def _dft_cst(N, fft_length, dtype):
        def _arange(dim, dtype, resh):
            return numpy.arange(dim).astype(dtype).reshape(resh)
    
        def _prod(n, k):
            return (-2j * numpy.pi * k / fft_length) * n
    
        def _exp(m):
            return numpy.exp(m)
        
        n = _arange(N, dtype, (-1, 1))
        k = _arange(fft_length, dtype, (1, -1))
        M = _exp(_prod(n, k))
        return M
    
    
    def custom_fft(x, fft_type, length, axis, dft_fct=None):
        # https://github.com/numpy/numpy/blob/4adc87dff15a247e417d50f10cc4def8e1c17a03/numpy/fft/_pocketfft.py#L56
        if dft_fct is None:
            dft_fct = _dft_cst
        if fft_type == 'FFT':
            if x.shape[axis] > length:
                # fft_length > shape on the same axis
                # the matrix is shortened
                slices = [slice(None)] * len(x.shape)
                slices[axis] = slice(0, length)
                new_x = x[tuple(slices)]
            elif x.shape[axis] == length:
                new_x = x
            else:
                # other, the matrix is completed with zeros
                shape = list(x.shape)
                shape[axis] = length
                slices = [slice(None)] * len(x.shape)
                slices[axis] = slice(0, length)
                zeros = numpy.zeros(tuple(shape), dtype=x.dtype)
                index = [slice(0, i) for i in x.shape]
                zeros[tuple(index)] = x
                new_x = zeros
    
            cst = dft_fct(new_x.shape[axis], length, x.dtype)
            perm = numpy.arange(len(x.shape)).tolist()        
            if perm[axis] == perm[-1]:
                res = numpy.matmul(new_x, cst).transpose(perm)
            else:
                perm[axis], perm[-1] = perm[-1], perm[axis]            
                rest = new_x.transpose(perm)
                res = numpy.matmul(rest, cst).transpose(perm)
                perm[axis], perm[0] = perm[0], perm[axis]
            return res
        raise ValueError("Unexpected value for fft_type=%r." % fft_type)
    
    
    def custom_fftn(x, fft_type, fft_length, axes, dft_fct=None):
        if len(axes) != len(fft_length):
            raise ValueError("Length mismatch axes=%r, fft_length=%r." % (
               axes, fft_length))
        if fft_type == 'FFT':
            res = x
            for i in range(len(fft_length) - 1, -1, -1):
                length = fft_length[i]
                axis = axes[i]
                res = custom_fft(res, fft_type, length, axis, dft_fct=dft_fct)
            return res
        raise ValueError("Unexpected value for fft_type=%r." % fft_type)
    
        
    shape = (4, )
    fft_length = [5,]
    axes = [0]
    rnd = numpy.random.randn(*shape) + numpy.random.randn(*shape) * 1j
    custom_fftn(rnd, 'FFT', fft_length, axes), numpy_fftn(rnd, 'FFT', fft_length, axes)
    assert_almost_equal(custom_fftn(rnd, 'FFT', fft_length, axes),
                        numpy_fftn(rnd, 'FFT', fft_length, axes), decimal=5)
    
    shape = (4, 3)
    fft_length = [3, 2]
    axes = [0, 1]
    rnd = numpy.random.randn(*shape) + numpy.random.randn(*shape) * 1j
    custom_fftn(rnd, 'FFT', fft_length, axes), numpy_fftn(rnd, 'FFT', fft_length, axes)
    assert_almost_equal(custom_fftn(rnd, 'FFT', fft_length, axes),
                        numpy_fftn(rnd, 'FFT', fft_length, axes), decimal=5)

.. code:: ipython3

    %timeit -n 1 -r 1 test_fct(numpy_fftn, custom_fftn, decimal=4)


.. parsed-literal::
    2.35 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Benchmark
---------

.. code:: ipython3

    from cpyquickhelper.numbers.speed_measure import measure_time
    from tqdm import tqdm
    from pandas import DataFrame
    
    def benchmark(fcts, power2=False):
        axes = [1]
        if power2:
            shape = [512, 1024]
            lengths = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
        else:
            shape = [512, 150]
            lengths = list(range(8, 200, 8))
        rnd = numpy.random.randn(*shape) + numpy.random.randn(*shape) * 1j
    
        data = []
        for length in tqdm(lengths):
            fft_length = [length]
            for name, fct in fcts.items():
                obs = measure_time(lambda: fct(rnd, 'FFT', fft_length, axes),
                                   repeat=5, number=5)
                obs['name'] = name
                obs['length'] = length
                data.append(obs)
    
        df = DataFrame(data)
        return df
    
    
    df = benchmark({'numpy_fftn': numpy_fftn, 'custom_fftn': custom_fftn, 'torch_fftn': torch_fftn})
    piv = df.pivot("length", "name", "average")
    piv[:5]


.. parsed-literal::
    100%|██████████| 24/24 [00:06<00:00,  3.91it/s]


.. raw:: html

    <div>
    <style scoped>
        .dataframe tbody tr th:only-of-type {
            vertical-align: middle;
        }

        .dataframe tbody tr th {
            vertical-align: top;
        }

        .dataframe thead th {
            text-align: right;
        }
    </style>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;">
          <th>name</th>
          <th>custom_fftn</th>
          <th>numpy_fftn</th>
          <th>torch_fftn</th>
        </tr>
        <tr>
          <th>length</th>
          <th></th>
          <th></th>
          <th></th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <th>8</th>
          <td>0.000585</td>
          <td>0.000911</td>
          <td>0.003643</td>
        </tr>
        <tr>
          <th>16</th>
          <td>0.001669</td>
          <td>0.001373</td>
          <td>0.004087</td>
        </tr>
        <tr>
          <th>24</th>
          <td>0.002682</td>
          <td>0.003273</td>
          <td>0.005745</td>
        </tr>
        <tr>
          <th>32</th>
          <td>0.004288</td>
          <td>0.003275</td>
          <td>0.004657</td>
        </tr>
        <tr>
          <th>40</th>
          <td>0.004818</td>
          <td>0.003831</td>
          <td>0.005198</td>
        </tr>
      </tbody>
    </table>
    </div>


.. code:: ipython3

    piv.plot(logy=True, logx=True, title="FFT benchmark", figsize=(12, 4));


.. image:: onnx_ffts_13_0.png


.. code:: ipython3

    df = benchmark({'numpy_fftn': numpy_fftn, 'custom_fftn': custom_fftn, 'torch_fftn': torch_fftn},
                  power2=True)
    piv = df.pivot("length", "name", "average")
    piv


.. parsed-literal::
    100%|██████████| 10/10 [00:13<00:00,  1.33s/it]


.. raw:: html

    <div>
    <style scoped>
        .dataframe tbody tr th:only-of-type {
            vertical-align: middle;
        }

        .dataframe tbody tr th {
            vertical-align: top;
        }

        .dataframe thead th {
            text-align: right;
        }
    </style>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;">
          <th>name</th>
          <th>custom_fftn</th>
          <th>numpy_fftn</th>
          <th>torch_fftn</th>
        </tr>
        <tr>
          <th>length</th>
          <th></th>
          <th></th>
          <th></th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <th>2</th>
          <td>0.000434</td>
          <td>0.001167</td>
          <td>0.023980</td>
        </tr>
        <tr>
          <th>4</th>
          <td>0.001117</td>
          <td>0.001671</td>
          <td>0.022530</td>
        </tr>
        <tr>
          <th>8</th>
          <td>0.001428</td>
          <td>0.002077</td>
          <td>0.022102</td>
        </tr>
        <tr>
          <th>16</th>
          <td>0.004654</td>
          <td>0.002874</td>
          <td>0.019792</td>
        </tr>
        <tr>
          <th>32</th>
          <td>0.003172</td>
          <td>0.002689</td>
          <td>0.017474</td>
        </tr>
        <tr>
          <th>64</th>
          <td>0.006966</td>
          <td>0.004612</td>
          <td>0.018116</td>
        </tr>
        <tr>
          <th>128</th>
          <td>0.030904</td>
          <td>0.011608</td>
          <td>0.023369</td>
        </tr>
        <tr>
          <th>256</th>
          <td>0.123821</td>
          <td>0.025853</td>
          <td>0.023532</td>
        </tr>
        <tr>
          <th>512</th>
          <td>0.476802</td>
          <td>0.043352</td>
          <td>0.033228</td>
        </tr>
        <tr>
          <th>1024</th>
          <td>1.527917</td>
          <td>0.109868</td>
          <td>0.052858</td>
        </tr>
      </tbody>
    </table>
    </div>


.. code:: ipython3

    piv.plot(logy=True, logx=True, title="FFT benchmark (power2)", figsize=(12, 4));


.. image:: onnx_ffts_15_0.png


Profiling
---------

.. code:: ipython3

    from pyquickhelper.pycode.profiling import profile2graph, profile
    
    shape = [512, 128]
    fft_length = [128]
    axes = [1]
    rnd = numpy.random.randn(*shape) + numpy.random.randn(*shape) * 1j
    
    def f():
        for i in range(100):
            custom_fftn(rnd, 'FFT', fft_length, axes)
    
    stat, text = profile(f)
    gr = profile2graph(stat)
    print(gr[0].to_text(fct_width=40))


.. parsed-literal::
    f                                        --    1    1 -- 0.01752 0.54515 -- <ipython-input-81-3ee1763130c2>:8:f (f)
        custom_fftn                          --  100  100 -- 0.00234 0.52763 -- <ipython-input-7-85a4c9f552d3>:57:custom_fftn (custom_fftn)
            custom_fft                       --  100  100 -- 0.19936 0.52516 -- <ipython-input-7-85a4c9f552d3>:20:custom_fft (custom_fft)
                _dft_cst                     --  100  100 -- 0.31917 0.32366 -- <ipython-input-61-afe90fb073f9>:4:_dft_cst (_dft_cst)
                    _arange                  --  200  200 -- 0.00088 0.00449 -- <ipython-input-61-afe90fb073f9>:5:_arange (_arange)
                        <method '...objects> --  200  200 -- 0.00128 0.00128 -- ~:0:<method 'astype' of 'numpy.ndarray' objects> (<method 'astype' of 'numpy.ndarray' objects>)
                        <method '...objects> --  200  200 -- 0.00064 0.00064 -- ~:0:<method 'reshape' of 'numpy.ndarray' objects> (<method 'reshape' of 'numpy.ndarray' objects>)
                        <built-in....arange> --  200  200 -- 0.00169 0.00169 -- ~:0:<built-in method numpy.arange> (<built-in method numpy.arange>) +++
                <built-in met...uiltins.len> --  100  100 -- 0.00011 0.00011 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>) +++
                <method 'toli...ay' objects> --  100  100 -- 0.00024 0.00024 -- ~:0:<method 'tolist' of 'numpy.ndarray' objects> (<method 'tolist' of 'numpy.ndarray' objects>)
                <method 'tran...ay' objects> --  100  100 -- 0.00076 0.00076 -- ~:0:<method 'transpose' of 'numpy.ndarray' objects> (<method 'transpose' of 'numpy.ndarray' objects>)
                <built-in met...umpy.arange> --  100  100 -- 0.00102 0.00102 -- ~:0:<built-in method numpy.arange> (<built-in method numpy.arange>) +++
            <built-in method builtins.len>   --  300  300 -- 0.00013 0.00013 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>) +++
    <built-in method builtins.len>           --  400  400 -- 0.00024 0.00024 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>)
    <built-in method numpy.arange>           --  300  300 -- 0.00271 0.00271 -- ~:0:<built-in method numpy.arange> (<built-in method numpy.arange>)


We can see that function ``_dft_cst`` is the bottle neck and more
precisely the exponential. We need to use the symmetries of the matrix
it builds.

Faster \_dft_cst
----------------

The function builds the matrix
:math:`M_{nk} = \left( \exp\left(\frac{-2i\pi nk}{K}\right) \right)_{nk}`
where :math:`1 \leqslant n \leqslant N` and
:math:`1 \leqslant k \leqslant K`. So it computes powers of the unity
roots.

.. math::


   \exp\left(\frac{-2i\pi nk}{K}\right) = \exp\left(\frac{-2i\pi k}{K}\right)^n = \exp\left(\frac{-2i\pi}{K}\right)^{nk}

We use that expression to reduce the number of exponentiels to compute.

.. code:: ipython3

    import numpy
    from numpy.testing import assert_almost_equal
    
    def _dft_cst(N, fft_length, dtype=numpy.float32):
        def _arange(dim, dtype, resh):
            return numpy.arange(dim).astype(dtype).reshape(resh)
    
        n = _arange(N, dtype, (-1, 1))
        k = _arange(fft_length, dtype, (1, -1))
        M = (-2j * numpy.pi * k / fft_length) * n
        numpy.exp(M, out=M)
        return M
    
    
    M = _dft_cst(3, 4, numpy.float32)
    M.shape, M.dtype


.. parsed-literal::
    ((3, 4), dtype('complex64'))


.. code:: ipython3

    M = _dft_cst(4, 3, numpy.float64)
    M.shape, M.dtype


.. parsed-literal::
    ((4, 3), dtype('complex128'))


.. code:: ipython3

    M


.. parsed-literal::
    array([[ 1. +0.00000000e+00j,  1. +0.00000000e+00j,  1. +0.00000000e+00j],
           [ 1. +0.00000000e+00j, -0.5-8.66025404e-01j, -0.5+8.66025404e-01j],
           [ 1. +0.00000000e+00j, -0.5+8.66025404e-01j, -0.5-8.66025404e-01j],
           [ 1. +0.00000000e+00j,  1. +2.44929360e-16j,  1. +4.89858720e-16j]])


.. code:: ipython3

    def _dft_cst_power(N, fft_length, dtype=numpy.float32):
        if dtype == numpy.float32:
            ctype = numpy.complex64
        else:
            ctype = numpy.complex128
        M = numpy.empty((N, fft_length), dtype=ctype)
        M[0, :] = 1
        M[1, 0] = 1
        root = numpy.exp(numpy.pi / fft_length * (-2j))
        current = root
        M[1, 1] = root
        for i in range(2, M.shape[1]):
            current *= root
            M[1, i] = current
        for i in range(2, M.shape[0]):
            numpy.multiply(M[i-1, :], M[1, :], out=M[i, :])
        return M
    
    M_pow = _dft_cst_power(4, 3, numpy.float64)
    M_pow


.. parsed-literal::
    array([[ 1. +0.00000000e+00j,  1. +0.00000000e+00j,  1. +0.00000000e+00j],
           [ 1. +0.00000000e+00j, -0.5-8.66025404e-01j, -0.5+8.66025404e-01j],
           [ 1. +0.00000000e+00j, -0.5+8.66025404e-01j, -0.5-8.66025404e-01j],
           [ 1. +0.00000000e+00j,  1. +6.10622664e-16j,  1. +1.22124533e-15j]])


.. code:: ipython3

    assert_almost_equal(M, M_pow)

.. code:: ipython3

    dims = (10, 15)
    assert_almost_equal(_dft_cst(*dims, dtype=numpy.float32), 
                        _dft_cst_power(*dims, dtype=numpy.float32),
                        decimal=5)

Benchmark again
---------------

.. code:: ipython3

    def custom_fftn_power(*args, **kwargs):
        return custom_fftn(*args, dft_fct=_dft_cst_power, **kwargs)
    
    
    %timeit -r 1 -n 1 test_fct(numpy_fftn, custom_fftn_power, decimal=4)


.. parsed-literal::
    1.46 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


.. code:: ipython3

    df = benchmark({
        'numpy_fftn': numpy_fftn, 'torch_fftn': torch_fftn, 'custom_fftn': custom_fftn, 
        'custom_fftn_power': custom_fftn_power})
    piv = df.pivot("length", "name", "average")
    piv[:5]


.. parsed-literal::
    100%|██████████| 24/24 [00:07<00:00,  3.19it/s]


.. raw:: html

    <div>
    <style scoped>
        .dataframe tbody tr th:only-of-type {
            vertical-align: middle;
        }

        .dataframe tbody tr th {
            vertical-align: top;
        }

        .dataframe thead th {
            text-align: right;
        }
    </style>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;">
          <th>name</th>
          <th>custom_fftn</th>
          <th>custom_fftn_power</th>
          <th>numpy_fftn</th>
          <th>torch_fftn</th>
        </tr>
        <tr>
          <th>length</th>
          <th></th>
          <th></th>
          <th></th>
          <th></th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <th>8</th>
          <td>0.000991</td>
          <td>0.000837</td>
          <td>0.001177</td>
          <td>0.007033</td>
        </tr>
        <tr>
          <th>16</th>
          <td>0.002758</td>
          <td>0.002591</td>
          <td>0.002069</td>
          <td>0.006228</td>
        </tr>
        <tr>
          <th>24</th>
          <td>0.003087</td>
          <td>0.002816</td>
          <td>0.002499</td>
          <td>0.005564</td>
        </tr>
        <tr>
          <th>32</th>
          <td>0.003767</td>
          <td>0.003068</td>
          <td>0.003306</td>
          <td>0.005985</td>
        </tr>
        <tr>
          <th>40</th>
          <td>0.004710</td>
          <td>0.003975</td>
          <td>0.004044</td>
          <td>0.005733</td>
        </tr>
      </tbody>
    </table>
    </div>


.. code:: ipython3

    piv.plot(logy=True, logx=True, title="FFT benchmark 2", figsize=(12, 4));


.. image:: onnx_ffts_29_0.png


.. code:: ipython3

    from pyquickhelper.pycode.profiling import profile2graph, profile
    
    shape = [512, 128]
    fft_length = [128]
    axes = [1]
    rnd = numpy.random.randn(*shape) + numpy.random.randn(*shape) * 1j
    
    def f():
        for i in range(100):
            custom_fftn_power(rnd, 'FFT', fft_length, axes)
    
    stat, text = profile(f)
    gr = profile2graph(stat)
    print(gr[0].to_text(fct_width=40))


.. parsed-literal::
    f                                        --    1    1 -- 0.02624 0.57688 -- <ipython-input-92-112d00957d81>:8:f (f)
        custom_fftn_power                    --  100  100 -- 0.00094 0.55064 -- <ipython-input-88-b403af8c0b43>:1:custom_fftn_power (custom_fftn_power)
            custom_fftn                      --  100  100 -- 0.00609 0.54970 -- <ipython-input-7-85a4c9f552d3>:57:custom_fftn (custom_fftn)
                custom_fft                   --  100  100 -- 0.46378 0.54342 -- <ipython-input-7-85a4c9f552d3>:20:custom_fft (custom_fft)
                    _dft_cst_power           --  100  100 -- 0.07599 0.07726 -- <ipython-input-85-8502f1ddbe1f>:1:_dft_cst_power (_dft_cst_power)
                        <built-in...y.empty> --  100  100 -- 0.00126 0.00126 -- ~:0:<built-in method numpy.empty> (<built-in method numpy.empty>)
                    <built-in m...ltins.len> --  100  100 -- 0.00008 0.00008 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>) +++
                    <method 'to...' objects> --  100  100 -- 0.00025 0.00025 -- ~:0:<method 'tolist' of 'numpy.ndarray' objects> (<method 'tolist' of 'numpy.ndarray' objects>)
                    <method 'tr...' objects> --  100  100 -- 0.00096 0.00096 -- ~:0:<method 'transpose' of 'numpy.ndarray' objects> (<method 'transpose' of 'numpy.ndarray' objects>)
                    <built-in m...py.arange> --  100  100 -- 0.00109 0.00109 -- ~:0:<built-in method numpy.arange> (<built-in method numpy.arange>)
                <built-in met...uiltins.len> --  300  300 -- 0.00020 0.00020 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>) +++
    <built-in method builtins.len>           --  400  400 -- 0.00027 0.00027 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>)


Cooley–Tukey FFT algorithm
--------------------------

See `Cooley–Tukey FFT
algorithm <https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm>`__.

The FFT matrix is defined by the matrix computation
:math:`F_{ak} = X_{an} M_{nk}`, then one coefficient is
(:math:`1 \leqslant n, k \leqslant K`):

.. math::


   F_{ak} = \sum_n X_{an} M_{nk} = \sum_n X_{an} \exp\left(\frac{-2i\pi}{K}\right)^{nk}

Let’s assume K is even, then
:math:`\exp\left(\frac{-2i\pi k}{K}\right) = -\exp\left(\frac{-2i\pi \left(k + \frac{K}{2}\right)}{K}\right)`.

.. code:: ipython3

    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(6, 3))
    a = numpy.arange(0, 12) * (-2 * numpy.pi / 12)
    X = numpy.vstack([numpy.cos(a), numpy.sin(a)]).T
    ax.plot(X[:, 0], X[:, 1], 'o');
    for i in range(0, 12):
        ax.text(X[i, 0], X[i, 1], "exp(-2pi %d/12)" % i)
    ax.set_title('unit roots');


.. image:: onnx_ffts_32_0.png


Then:

.. math::


   \begin{array}{rcl}
   F_{a,k + \frac{K}{2}} &=& \sum_{n=1}^{N} X_{an} \exp\left(\frac{-2i\pi}{K}\right)^{n\left(k + \frac{K}{2}\right)} \\
   &=&\sum_{n=1}^{N} X_{an} (-1)^n \exp\left(\frac{-2i\pi}{K}\right)^{nk} \\
   &=&\sum_{m=1}^{\frac{N}{2}} X_{a,2m} \exp\left(\frac{-2i\pi}{K}\right)^{2mk} - \sum_{m=1}^{\frac{N}{2}} X_{a,2m-1} \exp\left(\frac{-2i\pi}{K}\right)^{(2m-1)k} \\
   &=&\sum_{m=1}^{\frac{N}{2}} X_{a,2m} \exp\left(\frac{-2i\pi}{K}\right)^{2mk} - \sum_{m=1}^{\frac{N}{2}} X_{a,2m-1} \exp\left(\frac{-2i\pi}{K}\right)^{2mk} \exp\left(\frac{-2i\pi}{K}\right)^{-k}
   \end{array}

Then:

.. math::


   \begin{array}{rcl}
   F_{a,k} + F_{a,k+\frac{K}{2}} &=& 2\sum_{m=1}^{\frac{N}{2}} X_{a,2m}  \exp\left(\frac{-2i\pi}{K}\right)^{2mk}
   = 2\sum_{m=1}^{\frac{N}{2}} X_{a,2m}  \exp\left(\frac{-2i\pi}{\frac{K}{2}}\right)^{mk}
   \end{array}

Finally:

.. math::


   \begin{array}{rcl}
   F_{a,k} &=& \sum_{m=1}^{\frac{N}{2}} X_{a,2m} \exp\left(\frac{-2i\pi}{K}\right)^{2mk} + \sum_{m=1}^{\frac{N}{2}} X_{a,2m-1} \exp\left(\frac{-2i\pi}{K}\right)^{2mk} \exp\left(\frac{-2i\pi}{K}\right)^{-k} \\
   F_{a,k + \frac{K}{2}} &=&\sum_{m=1}^{\frac{N}{2}} X_{a,2m} \exp\left(\frac{-2i\pi}{K}\right)^{2mk} - \sum_{m=1}^{\frac{N}{2}} X_{a,2m-1} \exp\left(\frac{-2i\pi}{K}\right)^{2mk} \exp\left(\frac{-2i\pi}{K}\right)^{-k}
   \end{array}

Now, what happen when *K* is odd, fallback to the original computation.

.. math::


   F_{ak} = \sum_n X_{an} M_{nk} = \sum_n X_{an} \exp\left(\frac{-2i\pi}{K}\right)^{nk}

.. code:: ipython3

    import functools
    
    
    def cooley_fft_2p(x, fft_length):
        cst = _dft_cst_power(x.shape[-1], fft_length, x.dtype)
        return numpy.matmul(x, cst)
    
    
    @functools.cache
    def _build_fact(p2_2, fft_length, dtype):
        first = numpy.exp(-2j * numpy.pi / fft_length)
        fact = numpy.ones(p2_2, dtype=dtype)
        for k in range(1, p2_2):
            fact[k] = fact[k-1] * first
        return fact.reshape((1, -1))
    
    
    def build_fact(p2_2, fft_length, dtype):
        return _build_fact(p2_2, fft_length, dtype)
    
    
    def cooley_fft_recursive(x, fft_length):
        if len(x.shape) != 2:
            raise RuntimeError(
                "Unexpected x.shape=%r." % (x.shape, ))
        dtype = numpy.complex128 if x.dtype == numpy.float64 else numpy.complex64
        if fft_length == 1:
            return x[:, :1].astype(dtype)
    
        if fft_length % 2 == 0:
            def split(x):
                even = x[:, ::2]
                odd = x[:, 1::2]
                return even, odd
    
            def tmp1(even, odd, fft_length):
                p2_2 = fft_length // 2
                fft_even = cooley_fft_recursive(even, p2_2)
                fft_odd = cooley_fft_recursive(odd, p2_2)
                return fft_even, fft_odd, p2_2
    
            def tmp2(x, fft_even, fft_odd, p2_2):
                fact = build_fact(p2_2, fft_length, fft_even.dtype)
    
                fact_odd = fft_odd * fact
                return numpy.hstack([fft_even + fact_odd, fft_even - fact_odd])
    
                # inplace
                # result = numpy.empty((x.shape[0], fft_length), dtype=fft_even.dtype)
                # numpy.multiply(fft_odd, fact, out=result[:, :p2_2])
                # numpy.subtract(fft_even, result[:, :p2_2], out=result[:, p2_2:])
                # numpy.add(fft_even, result[:, :p2_2], out=result[:, :p2_2])
                # return result
        
            even, odd = split(x)
            fft_even, fft_odd, p2_2 = tmp1(even, odd, fft_length)
            result = tmp2(x, fft_even, fft_odd, p2_2)
        else:
            result = cooley_fft_2p(x, fft_length)
            
        return result
    
    
    def cooley_fft(x, fft_length):
        return cooley_fft_recursive(x, fft_length)
    
    
    def custom_fft_cooley(x, fft_type, length, axis):
        # https://github.com/numpy/numpy/blob/4adc87dff15a247e417d50f10cc4def8e1c17a03/numpy/fft/_pocketfft.py#L56
        if fft_type == 'FFT':
            if x.shape[axis] > length:
                # fft_length > shape on the same axis
                # the matrix is shortened
                slices = [slice(None)] * len(x.shape)
                slices[axis] = slice(0, length)
                new_x = x[tuple(slices)]
            elif x.shape[axis] == length:
                new_x = x
            else:
                # other, the matrix is completed with zeros
                shape = list(x.shape)
                shape[axis] = length
                slices = [slice(None)] * len(x.shape)
                slices[axis] = slice(0, length)
                zeros = numpy.zeros(tuple(shape), dtype=x.dtype)
                index = [slice(0, i) for i in x.shape]
                zeros[tuple(index)] = x
                new_x = zeros
    
            if axis == len(new_x.shape) - 1:
                if len(new_x.shape) != 2:
                    xt = new_x.reshape((-1, new_x.shape[-1]))
                else:
                    xt = new_x
                res = cooley_fft(xt, length)
                if len(new_x.shape) != 2:
                    res = res.reshape(new_x.shape[:-1] + (-1, ))
            else:
                perm = numpy.arange(len(x.shape)).tolist()        
                perm[axis], perm[-1] = perm[-1], perm[axis]            
                rest = new_x.transpose(perm)
                shape = rest.shape[:-1]
                rest = rest.reshape((-1, rest.shape[-1]))
                res = cooley_fft(rest, length)
                res = res.reshape(shape + (-1, )).transpose(perm)
                perm[axis], perm[0] = perm[0], perm[axis]
            return res
        raise ValueError("Unexpected value for fft_type=%r." % fft_type)
    
    
    def custom_fftn_cooley(x, fft_type, fft_length, axes):
        if len(axes) != len(fft_length):
            raise ValueError("Length mismatch axes=%r, fft_length=%r." % (
               axes, fft_length))
        if fft_type == 'FFT':
            res = x
            for i in range(len(fft_length) - 1, -1, -1):
                length = fft_length[i]
                axis = axes[i]
                res = custom_fft_cooley(res, fft_type, length, axis)
            return res
        raise ValueError("Unexpected value for fft_type=%r." % fft_type)
        
    
    shape = (4, )
    fft_length = [3,]
    axes = [0]
    rnd = numpy.random.randn(*shape) + numpy.random.randn(*shape) * 1j
    assert_almost_equal(custom_fftn_cooley(rnd, 'FFT', fft_length, axes),
                        numpy_fftn(rnd, 'FFT', fft_length, axes),
                        decimal=5)
    %timeit -n 1 -r 1 test_fct(numpy_fftn, custom_fftn_cooley)


.. parsed-literal::
    1.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


.. code:: ipython3

    df = benchmark({
        'numpy_fftn': numpy_fftn, 'torch_fftn': torch_fftn,
        'custom_fftn_power': custom_fftn_power, 'custom_fftn_cooley': custom_fftn_cooley})
    piv = df.pivot("length", "name", "average")
    piv[:5]


.. parsed-literal::
    100%|██████████| 24/24 [00:10<00:00,  2.35it/s]


.. raw:: html

    <div>
    <style scoped>
        .dataframe tbody tr th:only-of-type {
            vertical-align: middle;
        }

        .dataframe tbody tr th {
            vertical-align: top;
        }

        .dataframe thead th {
            text-align: right;
        }
    </style>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;">
          <th>name</th>
          <th>custom_fftn_cooley</th>
          <th>custom_fftn_power</th>
          <th>numpy_fftn</th>
          <th>torch_fftn</th>
        </tr>
        <tr>
          <th>length</th>
          <th></th>
          <th></th>
          <th></th>
          <th></th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <th>8</th>
          <td>0.002873</td>
          <td>0.000685</td>
          <td>0.001482</td>
          <td>0.005463</td>
        </tr>
        <tr>
          <th>16</th>
          <td>0.007197</td>
          <td>0.002121</td>
          <td>0.001922</td>
          <td>0.005063</td>
        </tr>
        <tr>
          <th>24</th>
          <td>0.009443</td>
          <td>0.002903</td>
          <td>0.002739</td>
          <td>0.005169</td>
        </tr>
        <tr>
          <th>32</th>
          <td>0.012783</td>
          <td>0.002556</td>
          <td>0.002003</td>
          <td>0.004076</td>
        </tr>
        <tr>
          <th>40</th>
          <td>0.014142</td>
          <td>0.003916</td>
          <td>0.003937</td>
          <td>0.005118</td>
        </tr>
      </tbody>
    </table>
    </div>


.. code:: ipython3

    piv.plot(logy=True, logx=True, title="FFT benchmark 3", figsize=(12, 4));


.. image:: onnx_ffts_37_0.png


.. code:: ipython3

    df = benchmark({
        'numpy_fftn': numpy_fftn, 'torch_fftn': torch_fftn,
        'custom_fftn_power': custom_fftn_power, 'custom_fftn_cooley': custom_fftn_cooley},
        power2=True)
    piv = df.pivot("length", "name", "average")
    piv[:5]


.. parsed-literal::
    100%|██████████| 10/10 [00:11<00:00,  1.15s/it]


.. raw:: html

    <div>
    <style scoped>
        .dataframe tbody tr th:only-of-type {
            vertical-align: middle;
        }

        .dataframe tbody tr th {
            vertical-align: top;
        }

        .dataframe thead th {
            text-align: right;
        }
    </style>
    <table border="1" class="dataframe">
      <thead>
        <tr style="text-align: right;">
          <th>name</th>
          <th>custom_fftn_cooley</th>
          <th>custom_fftn_power</th>
          <th>numpy_fftn</th>
          <th>torch_fftn</th>
        </tr>
        <tr>
          <th>length</th>
          <th></th>
          <th></th>
          <th></th>
          <th></th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <th>2</th>
          <td>0.000575</td>
          <td>0.000471</td>
          <td>0.000722</td>
          <td>0.019371</td>
        </tr>
        <tr>
          <th>4</th>
          <td>0.001153</td>
          <td>0.000328</td>
          <td>0.001130</td>
          <td>0.018366</td>
        </tr>
        <tr>
          <th>8</th>
          <td>0.003678</td>
          <td>0.000624</td>
          <td>0.001779</td>
          <td>0.019295</td>
        </tr>
        <tr>
          <th>16</th>
          <td>0.006843</td>
          <td>0.002255</td>
          <td>0.002192</td>
          <td>0.020169</td>
        </tr>
        <tr>
          <th>32</th>
          <td>0.015574</td>
          <td>0.003045</td>
          <td>0.002736</td>
          <td>0.017193</td>
        </tr>
      </tbody>
    </table>
    </div>


.. code:: ipython3

    piv.plot(logy=True, logx=True, title="FFT benchmark 3 (power2)", figsize=(12, 4));


.. image:: onnx_ffts_39_0.png


.. code:: ipython3

    from pyquickhelper.pycode.profiling import profile2graph, profile
    
    shape = [512, 256]
    fft_length = [256]
    axes = [1]
    rnd = numpy.random.randn(*shape) + numpy.random.randn(*shape) * 1j
    
    def f():
        for i in range(100):
            custom_fftn_cooley(rnd, 'FFT', fft_length, axes)
    
    stat, text = profile(f)
    gr = profile2graph(stat)
    print(gr[0].to_text(fct_width=40))


.. parsed-literal::
    cooley_fft_recursive                     --    100  51100 -- 0.24497 2.68339 -- <ipython-input-139-b9d3f22689f8>:22:cooley_fft_recursive (cooley_fft_recursive)
        split                                --  25500  25500 -- 0.06264 0.06264 -- <ipython-input-139-b9d3f22689f8>:31:split (split)
        tmp1                                 --    100  25500 -- 0.09438 2.54540 -- <ipython-input-139-b9d3f22689f8>:36:tmp1 (tmp1)
            cooley_fft_recursive             --  51000    200 -- 0.24336 2.54421 -- <ipython-input-139-b9d3f22689f8>:22:cooley_fft_recursive (cooley_fft_recursive) +++
        tmp2                                 --  25500  25500 -- 0.95948 2.04473 -- <ipython-input-139-b9d3f22689f8>:42:tmp2 (tmp2)
            hstack                           --  25500  25500 -- 0.04799 1.05776 -- <__array_function__ internals>:177:hstack (hstack)
                _vhstack_dispatcher          --  25500  25500 -- 0.02712 0.07002 -- C:/Python395_x64/lib/site-packages/numpy/core/shape_base.py:218:_vhstack_dispatcher (_vhstack_dispatcher)
                    _arrays_for...dispatcher --  25500  25500 -- 0.02361 0.04290 -- C:/Python395_x64/lib/site-packages/numpy/core/shape_base.py:207:_arrays_for_stack_dispatcher (_arrays_for_stack_dispatcher)
                        <built-in...hasattr> --  25500  25500 -- 0.01929 0.01929 -- ~:0:<built-in method builtins.hasattr> (<built-in method builtins.hasattr>)
                <built-in met...ay_function> --  25500  25500 -- 0.03753 0.93975 -- ~:0:<built-in method numpy.core._multiarray_umath.implement_array_function> (<built-in method numpy.core._multiarray_umath.implement_array_function>) +++
            build_fact                       --  25500  25500 -- 0.02749 0.02749 -- <ipython-input-139-b9d3f22689f8>:18:build_fact (build_fact)
        <built-in method builtins.len>       --  51100  51100 -- 0.01521 0.01521 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>) +++
        <method 'astype' ...darray' objects> --  25600  25600 -- 0.22146 0.22146 -- ~:0:<method 'astype' of 'numpy.ndarray' objects> (<method 'astype' of 'numpy.ndarray' objects>)
    f                                        --      1      1 -- 0.01449 2.70167 -- <ipython-input-144-55e663ef5e2e>:8:f (f)
        custom_fftn_cooley                   --    100    100 -- 0.00139 2.68718 -- <ipython-input-139-b9d3f22689f8>:112:custom_fftn_cooley (custom_fftn_cooley)
            custom_fft_cooley                --    100    100 -- 0.00135 2.68568 -- <ipython-input-139-b9d3f22689f8>:69:custom_fft_cooley (custom_fft_cooley)
                cooley_fft                   --    100    100 -- 0.00082 2.68421 -- <ipython-input-139-b9d3f22689f8>:65:cooley_fft (cooley_fft)
                    cooley_fft_recursive     --    100    100 -- 0.00160 2.68339 -- <ipython-input-139-b9d3f22689f8>:22:cooley_fft_recursive (cooley_fft_recursive) +++
                <built-in met...uiltins.len> --    300    300 -- 0.00012 0.00012 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>) +++
            <built-in method builtins.len>   --    300    300 -- 0.00011 0.00011 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>) +++
    <built-in method builtins.len>           --  77200  77200 -- 0.02367 0.02367 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>)
    <built-in method nu...nt_array_function> --  25500  76500 -- 0.58675 0.93975 -- ~:0:<built-in method numpy.core._multiarray_umath.implement_array_function> (<built-in method numpy.core._multiarray_umath.implement_array_function>)
        atleast_1d                           --  25500  25500 -- 0.09562 0.13747 -- C:/Python395_x64/lib/site-packages/numpy/core/shape_base.py:23:atleast_1d (atleast_1d)
            <method 'append...list' objects> --  51000  51000 -- 0.01708 0.01708 -- ~:0:<method 'append' of 'list' objects> (<method 'append' of 'list' objects>)
            <built-in method builtins.len>   --  25500  25500 -- 0.00822 0.00822 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>) +++
            <built-in metho...py.asanyarray> --  51000  51000 -- 0.01655 0.01655 -- ~:0:<built-in method numpy.asanyarray> (<built-in method numpy.asanyarray>)
        hstack                               --  25500  25500 -- 0.09871 0.90222 -- C:/Python395_x64/lib/site-packages/numpy/core/shape_base.py:285:hstack (hstack)
            concatenate                      --  25500  25500 -- 0.04882 0.57709 -- <__array_function__ internals>:177:concatenate (concatenate)
                concatenate                  --  25500  25500 -- 0.01049 0.01049 -- C:/Python395_x64/lib/site-packages/numpy/core/multiarray.py:148:concatenate (concatenate)
                <built-in met...ay_function> --  25500  25500 -- 0.51778 0.51778 -- ~:0:<built-in method numpy.core._multiarray_umath.implement_array_function> (<built-in method numpy.core._multiarray_umath.implement_array_function>) +++
            atleast_1d                       --  25500  25500 -- 0.04022 0.21751 -- <__array_function__ internals>:177:atleast_1d (atleast_1d)
                _atleast_1d_dispatcher       --  25500  25500 -- 0.00838 0.00838 -- C:/Python395_x64/lib/site-packages/numpy/core/shape_base.py:19:_atleast_1d_dispatcher (_atleast_1d_dispatcher)
                <built-in met...ay_function> --  25500  25500 -- 0.03144 0.16891 -- ~:0:<built-in method numpy.core._multiarray_umath.implement_array_function> (<built-in method numpy.core._multiarray_umath.implement_array_function>) +++
            <built-in metho...ns.isinstance> --  25500  25500 -- 0.00892 0.00892 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>)