From 92ca9fc032ecaa7f9595f5c5f7d9d8df6c972bbe Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 19 May 2009 00:21:04 -0400
Subject: [PATCH 01/18] initial pass of FFT module -- includes complex 1-d case
 only

---
 unsupported/Eigen/FFT.h                       |  84 +++++
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 296 ++++++++++++++++++
 unsupported/test/CMakeLists.txt               |   1 +
 unsupported/test/FFT.cpp                      |  85 +++++
 4 files changed, 466 insertions(+)
 create mode 100644 unsupported/Eigen/FFT.h
 create mode 100644 unsupported/Eigen/src/FFT/simple_fft_traits.h
 create mode 100644 unsupported/test/FFT.cpp
diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT.h
new file mode 100644
index 000000000..03490d2c5
--- /dev/null
+++ b/unsupported/Eigen/FFT.h
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_FFT_H
+#define EIGEN_FFT_H
+
+// simple_fft_traits:  small, free, reasonably efficient default, derived from kissfft
+#include "src/FFT/simple_fft_traits.h"
+#define DEFAULT_FFT_TRAITS simple_fft_traits
+
+// FFTW: faster, GPL-not LGPL, bigger code size
+#ifdef FFTW_PATIENT  // definition of FFTW_PATIENT indicates the caller has included fftw3.h, we can use FFTW routines
+// TODO 
+// #include "src/FFT/fftw_traits.h"
+// #define DEFAULT_FFT_TRAITS fftw_traits
+#endif
+
+// intel Math Kernel Library: fastest, commerical
+#ifdef _MKL_DFTI_H_ // mkl_dfti.h has been included, we can use MKL FFT routines
+// TODO 
+// #include "src/FFT/imkl_traits.h"
+// #define DEFAULT_FFT_TRAITS imkl_traits
+#endif
+
+namespace Eigen {
+
+template <typename _Scalar,
+         typename _Traits=DEFAULT_FFT_TRAITS<_Scalar> 
+         >
+class FFT
+{
+  public:
+    typedef _Traits traits_type;
+    typedef typename traits_type::Scalar Scalar;
+    typedef typename traits_type::Complex Complex;
+
+    FFT(const traits_type & traits=traits_type() ) :m_traits(traits) { }
+
+    void fwd( Complex * dst, const Complex * src, int nfft)
+    {
+      m_traits.prepare(nfft,false,dst,src);
+      m_traits.exec(dst,src);
+      m_traits.postprocess(dst);
+    }
+
+    void inv( Complex * dst, const Complex * src, int nfft)
+    {
+      m_traits.prepare(nfft,true,dst,src);
+      m_traits.exec(dst,src);
+      m_traits.postprocess(dst);
+    }
+
+    // TODO: fwd,inv for Scalar
+    // TODO: multi-dimensional FFTs
+    // TODO: handle Eigen MatrixBase
+
+    traits_type & traits() {return m_traits;}
+  private:
+    traits_type m_traits;
+};
+#undef DEFAULT_FFT_TRAITS
+}
+#endif
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
new file mode 100644
index 000000000..fe9d24b84
--- /dev/null
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -0,0 +1,296 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include <complex>
+#include <vector>
+
+namespace Eigen {
+
+  template <typename _Scalar>
+  struct simple_fft_traits
+  {
+    typedef _Scalar Scalar;
+    typedef std::complex<Scalar> Complex;
+    simple_fft_traits() : m_nfft(0) {} 
+
+    void prepare(int nfft,bool inverse,Complex * dst,const Complex *src)
+    {
+      if (m_nfft == nfft) {
+        // reuse the twiddles, conjugate if necessary
+        if (inverse != m_inverse)
+          for (int i=0;i<nfft;++i)
+            m_twiddles[i] = conj( m_twiddles[i] );
+        m_inverse = inverse;
+        return;
+      }
+      m_nfft = nfft;
+      m_inverse = inverse;
+      m_twiddles.resize(nfft);
+      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+      for (int i=0;i<nfft;++i)
+        m_twiddles[i] = exp( Complex(0,i*phinc) );
+
+      m_stageRadix.resize(0);
+      m_stageRemainder.resize(0);
+      //factorize
+      //start factoring out 4's, then 2's, then 3,5,7,9,...
+      int n= nfft;
+      int p=4;
+      do {
+        while (n % p) {
+          switch (p) {
+            case 4: p = 2; break;
+            case 2: p = 3; break;
+            default: p += 2; break;
+          }
+          if (p*p>n)
+            p=n;// no more factors
+        }
+        n /= p;
+        m_stageRadix.push_back(p);
+        m_stageRemainder.push_back(n);
+      }while(n>1);
+    }
+
+    void exec(Complex * dst, const Complex * src)
+    {
+      work(0, dst, src, 1,1);
+    }
+
+    void postprocess(Complex *dst) 
+    {
+        if (m_inverse) {
+            Scalar scale = 1./m_nfft;
+            for (int k=0;k<m_nfft;++k)
+                dst[k] *= scale;
+        }
+    }
+  private:
+    void work( int stage,Complex * Fout, const Complex * f, size_t fstride,size_t in_stride)
+    {
+      int p = m_stageRadix[stage];
+      int m = m_stageRemainder[stage];
+      Complex * Fout_beg = Fout;
+      Complex * Fout_end = Fout + p*m;
+
+      if (m==1) {
+        do{
+          *Fout = *f;
+          f += fstride*in_stride;
+        }while(++Fout != Fout_end );
+      }else{
+        do{
+          // recursive call:
+          // DFT of size m*p performed by doing
+          // p instances of smaller DFTs of size m, 
+          // each one takes a decimated version of the input
+          work(stage+1, Fout , f, fstride*p,in_stride);
+          f += fstride*in_stride;
+        }while( (Fout += m) != Fout_end );
+      }
+
+      Fout=Fout_beg;
+
+      // recombine the p smaller DFTs 
+      switch (p) {
+        case 2: bfly2(Fout,fstride,m); break;
+        case 3: bfly3(Fout,fstride,m); break;
+        case 4: bfly4(Fout,fstride,m); break;
+        case 5: bfly5(Fout,fstride,m); break;
+        default: bfly_generic(Fout,fstride,m,p); break;
+      }
+    }
+
+    void bfly2( Complex * Fout, const size_t fstride, int m)
+    {
+      for (int k=0;k<m;++k) {
+        Complex t = Fout[m+k] * m_twiddles[k*fstride];
+        Fout[m+k] = Fout[k] - t;
+        Fout[k] += t;
+      }
+    }
+
+    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex scratch[7];
+      int negative_if_inverse = m_inverse * -2 +1;
+      for (size_t k=0;k<m;++k) {
+        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+        scratch[5] = Fout[k] - scratch[1];
+
+        Fout[k] += scratch[1];
+        scratch[3] = scratch[0] + scratch[2];
+        scratch[4] = scratch[0] - scratch[2];
+        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+        Fout[k+2*m]  = Fout[k] - scratch[3];
+        Fout[k] += scratch[3];
+        Fout[k+m] = scratch[5] + scratch[4];
+        Fout[k+3*m] = scratch[5] - scratch[4];
+      }
+    }
+
+    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      size_t k=m;
+      const size_t m2 = 2*m;
+      Complex *tw1,*tw2;
+      Complex scratch[5];
+      Complex epi3;
+      epi3 = m_twiddles[fstride*m];
+
+      tw1=tw2=&m_twiddles[0];
+
+      do{
+        scratch[1]=Fout[m] * *tw1;
+        scratch[2]=Fout[m2] * *tw2;
+
+        scratch[3]=scratch[1]+scratch[2];
+        scratch[0]=scratch[1]-scratch[2];
+        tw1 += fstride;
+        tw2 += fstride*2;
+
+        Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
+
+        scratch[0] *= epi3.imag();
+
+        *Fout += scratch[3];
+
+        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+
+        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+        ++Fout;
+      }while(--k);
+    }
+
+    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+      size_t u;
+      Complex scratch[13];
+      Complex * twiddles = &m_twiddles[0];
+      Complex *tw;
+      Complex ya,yb;
+      ya = twiddles[fstride*m];
+      yb = twiddles[fstride*2*m];
+
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      tw=twiddles;
+      for ( u=0; u<m; ++u ) {
+        scratch[0] = *Fout0;
+
+        scratch[1]  = *Fout1 * tw[u*fstride];
+        scratch[2]  = *Fout2 * tw[2*u*fstride];
+        scratch[3]  = *Fout3 * tw[3*u*fstride];
+        scratch[4]  = *Fout4 * tw[4*u*fstride];
+
+        scratch[7] = scratch[1] + scratch[4];
+        scratch[10] = scratch[1] - scratch[4];
+        scratch[8] = scratch[2] + scratch[3];
+        scratch[9] = scratch[2] - scratch[3];
+
+        *Fout0 +=  scratch[7];
+        *Fout0 +=  scratch[8];
+
+        scratch[5] = scratch[0] + Complex(
+            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
+            );
+
+        scratch[6] = Complex(
+            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+            );
+
+        *Fout1 = scratch[5] - scratch[6];
+        *Fout4 = scratch[5] + scratch[6];
+
+        scratch[11] = scratch[0] +
+          Complex(
+              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+              );
+
+        scratch[12] = Complex(
+            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+            );
+
+        *Fout2=scratch[11]+scratch[12];
+        *Fout3=scratch[11]-scratch[12];
+
+        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+    }
+
+    /* perform the butterfly for one stage of a mixed radix FFT */
+    void bfly_generic(
+        Complex * Fout,
+        const size_t fstride,
+        int m,
+        int p
+        )
+    {
+      int u,k,q1,q;
+      Complex * twiddles = &m_twiddles[0];
+      Complex t;
+      int Norig = m_nfft;
+      Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+
+      for ( u=0; u<m; ++u ) {
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          scratchbuf[q1] = Fout[ k  ];
+          k += m;
+        }
+
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          int twidx=0;
+          Fout[ k ] = scratchbuf[0];
+          for (q=1;q<p;++q ) {
+            twidx += fstride * k;
+            if (twidx>=Norig) twidx-=Norig;
+            t=scratchbuf[q] * twiddles[twidx];
+            Fout[ k ] += t;
+          }
+          k += m;
+        }
+      }
+    }
+
+    int m_nfft;
+    bool m_inverse;
+    std::vector<Complex> m_twiddles;
+    std::vector<int> m_stageRadix;
+    std::vector<int> m_stageRemainder;
+  };
+}
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index abfbb0185..49de7dfb2 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -19,3 +19,4 @@ ei_add_test(autodiff)
 ei_add_test(BVH)
 ei_add_test(matrixExponential)
 ei_add_test(alignedvector3)
+ei_add_test(FFT)
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
new file mode 100644
index 000000000..b13a7810f
--- /dev/null
+++ b/unsupported/test/FFT.cpp
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+#include <unsupported/Eigen/FFT.h>
+
+//#include <iostream>
+//#include <cstdlib>
+//#include <typeinfo>
+
+using namespace std;
+
+template <class T>
+void test_fft(int nfft)
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+
+    //cout << "type:" << typeid(T).name() << " nfft:" << nfft;
+
+    FFT<T> fft;
+
+    vector<Complex> inbuf(nfft);
+    vector<Complex> buf3(nfft);
+    vector<Complex> outbuf(nfft);
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= Complex( 
+                (T)(rand()/(double)RAND_MAX - .5),
+                (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( &outbuf[0] , &inbuf[0] ,nfft);
+    fft.inv( &buf3[0] , &outbuf[0] ,nfft);
+
+    long double totalpower=0;
+    long double difpower=0;
+    for (int k0=0;k0<nfft;++k0) {
+        complex<long double> acc = 0;
+        long double phinc = 2*k0* M_PIl / nfft;
+        for (int k1=0;k1<nfft;++k1) {
+            complex<long double> x(inbuf[k1].real(),inbuf[k1].imag()); 
+            acc += x * exp( complex<long double>(0,-k1*phinc) );
+        }
+        totalpower += norm(acc);
+        complex<long double> x(outbuf[k0].real(),outbuf[k0].imag()); 
+        complex<long double> dif = acc - x;
+        difpower += norm(dif);
+    }
+    long double rmse = sqrt(difpower/totalpower);
+    VERIFY( rmse < 1e-5 );// gross check
+
+    totalpower=0;
+    difpower=0;
+    for (int k=0;k<nfft;++k) {
+        totalpower += norm( inbuf[k] );
+        difpower += norm(inbuf[k] - buf3[k]);
+    }
+    rmse = sqrt(difpower/totalpower);
+    VERIFY( rmse < 1e-5 );// gross check
+}
+
+void test_FFT()
+{
+  CALL_SUBTEST(( test_fft<float>(32) )); CALL_SUBTEST(( test_fft<double>(32) )); CALL_SUBTEST(( test_fft<long double>(32) ));
+  CALL_SUBTEST(( test_fft<float>(1024) )); CALL_SUBTEST(( test_fft<double>(1024) )); CALL_SUBTEST(( test_fft<long double>(1024) ));
+  CALL_SUBTEST(( test_fft<float>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<double>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<long double>(2*3*4*5*7) ));
+}

From be1b2ad4ec1851aa990dc97337fc74e16edd1dd0 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 19 May 2009 00:26:19 -0400
Subject: [PATCH 02/18] removed unused code

---
 unsupported/test/FFT.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index b13a7810f..8347bb76b 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -25,10 +25,6 @@
 #include "main.h"
 #include <unsupported/Eigen/FFT.h>
 
-//#include <iostream>
-//#include <cstdlib>
-//#include <typeinfo>
-
 using namespace std;
 
 template <class T>
@@ -36,8 +32,6 @@ void test_fft(int nfft)
 {
     typedef typename Eigen::FFT<T>::Complex Complex;
 
-    //cout << "type:" << typeid(T).name() << " nfft:" << nfft;
-
     FFT<T> fft;
 
     vector<Complex> inbuf(nfft);

From 68cad98bc935e53102a9432560085b81c5766743 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 19 May 2009 00:34:38 -0400
Subject: [PATCH 03/18] indent level change

---
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index fe9d24b84..6fbbeac2e 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -80,13 +80,15 @@ namespace Eigen {
 
     void postprocess(Complex *dst) 
     {
-        if (m_inverse) {
-            Scalar scale = 1./m_nfft;
-            for (int k=0;k<m_nfft;++k)
-                dst[k] *= scale;
-        }
+      if (m_inverse) {
+        Scalar scale = 1./m_nfft;
+        for (int k=0;k<m_nfft;++k)
+          dst[k] *= scale;
+      }
     }
-  private:
+
+    private:
+
     void work( int stage,Complex * Fout, const Complex * f, size_t fstride,size_t in_stride)
     {
       int p = m_stageRadix[stage];

From 8b4afe3debb47bf15ea291a7f2d21d863d546536 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Fri, 22 May 2009 22:37:59 -0400
Subject: [PATCH 04/18] added non-optimized real forward fft (no inverse yet)

---
 unsupported/Eigen/FFT.h                       |  27 ++++-
 unsupported/Eigen/src/FFT/simple_fft_traits.h |  10 +-
 unsupported/test/FFT.cpp                      | 113 ++++++++++++------
 3 files changed, 106 insertions(+), 44 deletions(-)

diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT.h
index 03490d2c5..a1f87a609 100644
--- a/unsupported/Eigen/FFT.h
+++ b/unsupported/Eigen/FFT.h
@@ -57,21 +57,36 @@ class FFT
 
     FFT(const traits_type & traits=traits_type() ) :m_traits(traits) { }
 
-    void fwd( Complex * dst, const Complex * src, int nfft)
+    template <typename _Input>
+    void fwd( Complex * dst, const _Input * src, int nfft)
     {
       m_traits.prepare(nfft,false,dst,src);
       m_traits.exec(dst,src);
       m_traits.postprocess(dst);
     }
 
-    void inv( Complex * dst, const Complex * src, int nfft)
+    template <typename _Input>
+    void fwd( std::vector<Complex> & dst, const std::vector<_Input> & src) 
     {
-      m_traits.prepare(nfft,true,dst,src);
-      m_traits.exec(dst,src);
-      m_traits.postprocess(dst);
+        dst.resize( src.size() );
+        fwd( &dst[0],&src[0],src.size() );
+    }
+
+    template <typename _Output>
+    void inv( _Output * dst, const Complex * src, int nfft)
+    {
+        m_traits.prepare(nfft,true,dst,src);
+        m_traits.exec(dst,src);
+        m_traits.postprocess(dst);
+    }
+
+    template <typename _Output>
+    void inv( std::vector<_Output> & dst, const std::vector<Complex> & src) 
+    {
+        dst.resize( src.size() );
+        inv( &dst[0],&src[0],src.size() );
     }
 
-    // TODO: fwd,inv for Scalar
     // TODO: multi-dimensional FFTs
     // TODO: handle Eigen MatrixBase
 
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index 6fbbeac2e..5a910dd1f 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -34,7 +34,8 @@ namespace Eigen {
     typedef std::complex<Scalar> Complex;
     simple_fft_traits() : m_nfft(0) {} 
 
-    void prepare(int nfft,bool inverse,Complex * dst,const Complex *src)
+    template <typename _Src>
+    void prepare(int nfft,bool inverse,Complex * dst,const _Src *src)
     {
       if (m_nfft == nfft) {
         // reuse the twiddles, conjugate if necessary
@@ -73,7 +74,8 @@ namespace Eigen {
       }while(n>1);
     }
 
-    void exec(Complex * dst, const Complex * src)
+    template <typename _Src>
+    void exec(Complex * dst, const _Src * src)
     {
       work(0, dst, src, 1,1);
     }
@@ -89,7 +91,9 @@ namespace Eigen {
 
     private:
 
-    void work( int stage,Complex * Fout, const Complex * f, size_t fstride,size_t in_stride)
+   
+    template <typename _Src>
+    void work( int stage,Complex * Fout, const _Src * f, size_t fstride,size_t in_stride)
     {
       int p = m_stageRadix[stage];
       int m = m_stageRemainder[stage];
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 8347bb76b..ef03359e2 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -25,55 +25,98 @@
 #include "main.h"
 #include <unsupported/Eigen/FFT.h>
 
+
 using namespace std;
 
+template < typename T>
+complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
+
+complex<long double>  promote(float x) { return complex<long double>( x); }
+complex<long double>  promote(double x) { return complex<long double>( x); }
+complex<long double>  promote(long double x) { return complex<long double>( x); }
+    
+
+    template <typename T1,typename T2>
+    long double fft_rmse( const vector<T1> & fftbuf,const vector<T2> & timebuf)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        for (size_t k0=0;k0<fftbuf.size();++k0) {
+            complex<long double> acc = 0;
+            long double phinc = -2.*k0* M_PIl / timebuf.size();
+            for (size_t k1=0;k1<timebuf.size();++k1) {
+                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
+            }
+            totalpower += norm(acc);
+            complex<long double> x = promote(fftbuf[k0]); 
+            complex<long double> dif = acc - x;
+            difpower += norm(dif);
+            cerr << k0 << ":" << acc << " " <<  x << endl;
+        }
+        cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
+        return sqrt(difpower/totalpower);
+    }
+
+    template <typename T1,typename T2>
+    long double dif_rmse( const vector<T1> buf1,const vector<T2> buf2)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        size_t n = min( buf1.size(),buf2.size() );
+        for (size_t k=0;k<n;++k) {
+            totalpower += (norm( buf1[k] ) + norm(buf2[k]) )/2.;
+            difpower += norm(buf1[k] - buf2[k]);
+        }
+        return sqrt(difpower/totalpower);
+    }
+
 template <class T>
-void test_fft(int nfft)
+void test_scalar(int nfft)
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+    typedef typename Eigen::FFT<T>::Scalar Scalar;
+
+    FFT<T> fft;
+    vector<Scalar> inbuf(nfft);
+    vector<Complex> outbuf;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= (T)(rand()/(double)RAND_MAX - .5);
+    fft.fwd( outbuf,inbuf);
+    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+}
+
+template <class T>
+void test_complex(int nfft)
 {
     typedef typename Eigen::FFT<T>::Complex Complex;
 
     FFT<T> fft;
 
     vector<Complex> inbuf(nfft);
-    vector<Complex> buf3(nfft);
-    vector<Complex> outbuf(nfft);
+    vector<Complex> outbuf;
+    vector<Complex> buf3;
     for (int k=0;k<nfft;++k)
-        inbuf[k]= Complex( 
-                (T)(rand()/(double)RAND_MAX - .5),
-                (T)(rand()/(double)RAND_MAX - .5) );
-    fft.fwd( &outbuf[0] , &inbuf[0] ,nfft);
-    fft.inv( &buf3[0] , &outbuf[0] ,nfft);
+        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( outbuf , inbuf);
 
-    long double totalpower=0;
-    long double difpower=0;
-    for (int k0=0;k0<nfft;++k0) {
-        complex<long double> acc = 0;
-        long double phinc = 2*k0* M_PIl / nfft;
-        for (int k1=0;k1<nfft;++k1) {
-            complex<long double> x(inbuf[k1].real(),inbuf[k1].imag()); 
-            acc += x * exp( complex<long double>(0,-k1*phinc) );
-        }
-        totalpower += norm(acc);
-        complex<long double> x(outbuf[k0].real(),outbuf[k0].imag()); 
-        complex<long double> dif = acc - x;
-        difpower += norm(dif);
-    }
-    long double rmse = sqrt(difpower/totalpower);
-    VERIFY( rmse < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
 
-    totalpower=0;
-    difpower=0;
-    for (int k=0;k<nfft;++k) {
-        totalpower += norm( inbuf[k] );
-        difpower += norm(inbuf[k] - buf3[k]);
-    }
-    rmse = sqrt(difpower/totalpower);
-    VERIFY( rmse < 1e-5 );// gross check
+    fft.inv( buf3 , outbuf);
+
+    VERIFY( dif_rmse(inbuf,buf3) < 1e-5 );// gross check
 }
 
 void test_FFT()
 {
-  CALL_SUBTEST(( test_fft<float>(32) )); CALL_SUBTEST(( test_fft<double>(32) )); CALL_SUBTEST(( test_fft<long double>(32) ));
-  CALL_SUBTEST(( test_fft<float>(1024) )); CALL_SUBTEST(( test_fft<double>(1024) )); CALL_SUBTEST(( test_fft<long double>(1024) ));
-  CALL_SUBTEST(( test_fft<float>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<double>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<long double>(2*3*4*5*7) ));
+  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
+  CALL_SUBTEST( test_complex<float>(1024) ); CALL_SUBTEST( test_complex<double>(1024) ); CALL_SUBTEST( test_complex<long double>(1024) );
+  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
+  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
+  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+
+  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 }

From 9c0fcd0f6213143216710a5b215aa2bb4a857ce5 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 23 May 2009 10:09:48 -0400
Subject: [PATCH 05/18] started real optimization, added benchmark for FFT

---
 bench/benchFFT.cpp                            |  64 +++++++++
 unsupported/Eigen/FFT.h                       |   8 +-
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 125 +++++++++++++-----
 unsupported/test/FFT.cpp                      |   3 +-
 4 files changed, 157 insertions(+), 43 deletions(-)
 create mode 100644 bench/benchFFT.cpp

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
new file mode 100644
index 000000000..041576b75
--- /dev/null
+++ b/bench/benchFFT.cpp
@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include <complex>
+#include <vector>
+#include <Eigen/Core>
+#include <bench/BenchTimer.h>
+#include <unsupported/Eigen/FFT.h>
+
+using namespace Eigen;
+using namespace std;
+
+#ifndef NFFT
+#define NFFT 1024
+#endif
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+#ifndef NITS 
+#define NITS (10000000/NFFT)
+#endif
+
+int main() 
+{
+  vector<complex<TYPE> > inbuf(NFFT);
+  vector<complex<TYPE> > outbuf(NFFT);
+  Eigen::FFT<TYPE> fft;
+
+  fft.fwd( outbuf , inbuf);
+
+  BenchTimer timer;
+  timer.reset();
+  for (int k=0;k<8;++k) {
+      timer.start();
+      for(int i = 0; i < NITS; i++)
+          fft.fwd( outbuf , inbuf);
+      timer.stop();
+  }
+  double mflops = 5.*NFFT*log2((double)NFFT) / (1e6 * timer.value() / (double)NITS );
+  cout << "NFFT=" << NFFT << "  " << (double(1e-6*NFFT*NITS)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
+}
diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT.h
index a1f87a609..c466423b7 100644
--- a/unsupported/Eigen/FFT.h
+++ b/unsupported/Eigen/FFT.h
@@ -60,9 +60,7 @@ class FFT
     template <typename _Input>
     void fwd( Complex * dst, const _Input * src, int nfft)
     {
-      m_traits.prepare(nfft,false,dst,src);
-      m_traits.exec(dst,src);
-      m_traits.postprocess(dst);
+        m_traits.fwd(dst,src,nfft);
     }
 
     template <typename _Input>
@@ -75,9 +73,7 @@ class FFT
     template <typename _Output>
     void inv( _Output * dst, const Complex * src, int nfft)
     {
-        m_traits.prepare(nfft,true,dst,src);
-        m_traits.exec(dst,src);
-        m_traits.postprocess(dst);
+        m_traits.inv( dst,src,nfft );
     }
 
     template <typename _Output>
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index 5a910dd1f..33433ae03 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -35,7 +35,73 @@ namespace Eigen {
     simple_fft_traits() : m_nfft(0) {} 
 
     template <typename _Src>
-    void prepare(int nfft,bool inverse,Complex * dst,const _Src *src)
+    void fwd( Complex * dst,const _Src *src,int nfft)
+    {
+        prepare(nfft,false);
+        work(0, dst, src, 1,1);
+        scale(dst);
+    }
+
+    void fwd( Complex * dst,const Scalar * src,int nfft) 
+    {
+        if ( nfft&1 ) {
+            // use generic mode for odd
+            prepare(nfft,false);
+            work(0, dst, src, 1,1);
+            scale(dst);
+        }else{
+            int ncfft = nfft>>1;
+            // use optimized mode for even real
+            prepare(nfft,false);
+            work(0,dst, reinterpret_cast<const Complex*> (src),2,1);
+            Complex dc = dst[0].real() +  dst[0].imag();
+            Complex nyquist = dst[0].real() -  dst[0].imag();
+            
+            int k;
+            for ( k=1;k <= ncfft/2 ; ++k ) {
+/**
+        fpk    = st->tmpbuf[k];
+        fpnk.r =   st->tmpbuf[ncfft-k].r;
+        fpnk.i = - st->tmpbuf[ncfft-k].i;
+
+        C_ADD( f1k, fpk , fpnk );
+        C_SUB( f2k, fpk , fpnk );
+        C_MUL( tw , f2k , st->super_twiddles[k-1]);
+
+        freqdata[k].r = HALF_OF(f1k.r + tw.r);
+        freqdata[k].i = HALF_OF(f1k.i + tw.i);
+        freqdata[ncfft-k].r = HALF_OF(f1k.r - tw.r);
+        freqdata[ncfft-k].i = HALF_OF(tw.i - f1k.i);
+ */
+                Complex fpk = dst[k];
+                Complex fpnk = conj(dst[ncfft-k]);
+
+                Complex f1k = fpk + fpnk;
+                Complex f2k = fpnk - fpk;
+                //Complex tw = f2k * exp( Complex(0,-3.14159265358979323846264338327 * ((double)k / ncfft + 1) ) ); // TODO repalce this with index into twiddles
+                Complex tw = f2k * m_twiddles[2*k];;
+                
+                dst[k] =  (f1k + tw) * Scalar(.5);
+                dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
+            }
+            // place conjugate-symmetric half at the end for completeness
+            // TODO: make this configurable ( opt-out )
+            for ( k=1;k < ncfft ; ++k )
+                dst[nfft-k] = conj(dst[k]);
+
+            dst[0] = dc;
+            dst[ncfft] = nyquist;
+        }
+    }
+
+    void inv(Complex * dst,const Complex  *src,int nfft)
+    {
+        prepare(nfft,true);
+        work(0, dst, src, 1,1);
+        scale(dst);
+    }
+
+    void prepare(int nfft,bool inverse)
     {
       if (m_nfft == nfft) {
         // reuse the twiddles, conjugate if necessary
@@ -74,57 +140,49 @@ namespace Eigen {
       }while(n>1);
     }
 
-    template <typename _Src>
-    void exec(Complex * dst, const _Src * src)
-    {
-      work(0, dst, src, 1,1);
-    }
-
-    void postprocess(Complex *dst) 
+    void scale(Complex *dst) 
     {
       if (m_inverse) {
-        Scalar scale = 1./m_nfft;
+        Scalar s = 1./m_nfft;
         for (int k=0;k<m_nfft;++k)
-          dst[k] *= scale;
+          dst[k] *= s;
       }
     }
 
     private:
 
-   
     template <typename _Src>
-    void work( int stage,Complex * Fout, const _Src * f, size_t fstride,size_t in_stride)
+    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
     {
       int p = m_stageRadix[stage];
       int m = m_stageRemainder[stage];
-      Complex * Fout_beg = Fout;
-      Complex * Fout_end = Fout + p*m;
+      Complex * Fout_beg = xout;
+      Complex * Fout_end = xout + p*m;
 
-      if (m==1) {
-        do{
-          *Fout = *f;
-          f += fstride*in_stride;
-        }while(++Fout != Fout_end );
-      }else{
+      if (m>1) {
         do{
           // recursive call:
           // DFT of size m*p performed by doing
           // p instances of smaller DFTs of size m, 
           // each one takes a decimated version of the input
-          work(stage+1, Fout , f, fstride*p,in_stride);
-          f += fstride*in_stride;
-        }while( (Fout += m) != Fout_end );
+          work(stage+1, xout , xin, fstride*p,in_stride);
+          xin += fstride*in_stride;
+        }while( (xout += m) != Fout_end );
+      }else{
+          do{
+              *xout = *xin;
+              xin += fstride*in_stride;
+          }while(++xout != Fout_end );
       }
-
-      Fout=Fout_beg;
+      xout=Fout_beg;
 
       // recombine the p smaller DFTs 
       switch (p) {
-        case 2: bfly2(Fout,fstride,m); break;
-        case 3: bfly3(Fout,fstride,m); break;
-        case 4: bfly4(Fout,fstride,m); break;
-        case 5: bfly5(Fout,fstride,m); break;
-        default: bfly_generic(Fout,fstride,m,p); break;
+        case 2: bfly2(xout,fstride,m); break;
+        case 3: bfly3(xout,fstride,m); break;
+        case 4: bfly4(xout,fstride,m); break;
+        case 5: bfly5(xout,fstride,m); break;
+        default: bfly_generic(xout,fstride,m,p); break;
       }
     }
 
@@ -139,7 +197,7 @@ namespace Eigen {
 
     void bfly4( Complex * Fout, const size_t fstride, const size_t m)
     {
-      Complex scratch[7];
+      Complex scratch[6];
       int negative_if_inverse = m_inverse * -2 +1;
       for (size_t k=0;k<m;++k) {
         scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
@@ -178,15 +236,10 @@ namespace Eigen {
         scratch[0]=scratch[1]-scratch[2];
         tw1 += fstride;
         tw2 += fstride*2;
-
         Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
-
         scratch[0] *= epi3.imag();
-
         *Fout += scratch[3];
-
         Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-
         Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
         ++Fout;
       }while(--k);
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index ef03359e2..13e98ba77 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -115,8 +115,9 @@ void test_FFT()
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
-
+/*
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
   CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
+  */
 }

From 304798817268706463f3ff645c8c8b2c887c090a Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 23 May 2009 12:50:07 -0400
Subject: [PATCH 06/18] scalar forward FFT optimized for even size, converts to
 cpx for odd

---
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 156 ++++++++++--------
 unsupported/test/FFT.cpp                      |  10 +-
 2 files changed, 97 insertions(+), 69 deletions(-)

diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index 33433ae03..f7dd2b9cf 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -24,6 +24,7 @@
 
 #include <complex>
 #include <vector>
+#include <iostream>
 
 namespace Eigen {
 
@@ -39,51 +40,54 @@ namespace Eigen {
     {
         prepare(nfft,false);
         work(0, dst, src, 1,1);
-        scale(dst);
     }
 
+    // real-to-complex forward FFT
+    // perform two FFTs of src even and src odd
+    // then twiddle to recombine them into the half-spectrum format
+    // then fill in the conjugate symmetric half
     void fwd( Complex * dst,const Scalar * src,int nfft) 
     {
         if ( nfft&1 ) {
             // use generic mode for odd
             prepare(nfft,false);
             work(0, dst, src, 1,1);
-            scale(dst);
         }else{
             int ncfft = nfft>>1;
             // use optimized mode for even real
-            prepare(nfft,false);
-            work(0,dst, reinterpret_cast<const Complex*> (src),2,1);
+            fwd( dst, reinterpret_cast<const Complex*> (src),ncfft);
+            make_real_twiddles(nfft);
+            std::cerr << "dst[0] = " << dst[0] << "\n";
             Complex dc = dst[0].real() +  dst[0].imag();
             Complex nyquist = dst[0].real() -  dst[0].imag();
-            
             int k;
-            for ( k=1;k <= ncfft/2 ; ++k ) {
-/**
-        fpk    = st->tmpbuf[k];
-        fpnk.r =   st->tmpbuf[ncfft-k].r;
-        fpnk.i = - st->tmpbuf[ncfft-k].i;
-
-        C_ADD( f1k, fpk , fpnk );
-        C_SUB( f2k, fpk , fpnk );
-        C_MUL( tw , f2k , st->super_twiddles[k-1]);
-
-        freqdata[k].r = HALF_OF(f1k.r + tw.r);
-        freqdata[k].i = HALF_OF(f1k.i + tw.i);
-        freqdata[ncfft-k].r = HALF_OF(f1k.r - tw.r);
-        freqdata[ncfft-k].i = HALF_OF(tw.i - f1k.i);
- */
+#if 0
+            using namespace std;
+            cerr << "desired:\n";
+            for ( k=1;k <= (ncfft>>1) ; ++k ) {
+                Complex x = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
+                cerr << k << " " << x << "angle(x):" << arg(x) << "\n";
+            }
+            dc=0;
+            cerr << "twiddles:\n";
+            for (k=0;k<ncfft;++k) {
+                Complex x = m_twiddles[k];
+                cerr << k << " " << x << "angle(-x):" << arg(-x) << "\n";
+            }
+#endif
+            for ( k=1;k <= (ncfft>>1) ; ++k ) {
                 Complex fpk = dst[k];
                 Complex fpnk = conj(dst[ncfft-k]);
-
                 Complex f1k = fpk + fpnk;
-                Complex f2k = fpnk - fpk;
-                //Complex tw = f2k * exp( Complex(0,-3.14159265358979323846264338327 * ((double)k / ncfft + 1) ) ); // TODO repalce this with index into twiddles
-                Complex tw = f2k * m_twiddles[2*k];;
-                
+                Complex f2k = fpk - fpnk;
+                //Complex tw = f2k * exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
+                Complex tw= f2k * m_realTwiddles[k-1];
+
                 dst[k] =  (f1k + tw) * Scalar(.5);
                 dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
             }
+ 
+
             // place conjugate-symmetric half at the end for completeness
             // TODO: make this configurable ( opt-out )
             for ( k=1;k < ncfft ; ++k )
@@ -98,55 +102,74 @@ namespace Eigen {
     {
         prepare(nfft,true);
         work(0, dst, src, 1,1);
-        scale(dst);
+        scale(dst, Scalar(1)/m_nfft );
     }
 
     void prepare(int nfft,bool inverse)
     {
-      if (m_nfft == nfft) {
-        // reuse the twiddles, conjugate if necessary
-        if (inverse != m_inverse)
-          for (int i=0;i<nfft;++i)
-            m_twiddles[i] = conj( m_twiddles[i] );
-        m_inverse = inverse;
-        return;
-      }
-      m_nfft = nfft;
-      m_inverse = inverse;
-      m_twiddles.resize(nfft);
-      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-      for (int i=0;i<nfft;++i)
-        m_twiddles[i] = exp( Complex(0,i*phinc) );
-
-      m_stageRadix.resize(0);
-      m_stageRemainder.resize(0);
-      //factorize
-      //start factoring out 4's, then 2's, then 3,5,7,9,...
-      int n= nfft;
-      int p=4;
-      do {
-        while (n % p) {
-          switch (p) {
-            case 4: p = 2; break;
-            case 2: p = 3; break;
-            default: p += 2; break;
-          }
-          if (p*p>n)
-            p=n;// no more factors
-        }
-        n /= p;
-        m_stageRadix.push_back(p);
-        m_stageRemainder.push_back(n);
-      }while(n>1);
+        make_twiddles(nfft,inverse);
+        factorize(nfft);
     }
 
-    void scale(Complex *dst) 
+    void make_real_twiddles(int nfft)
+    {
+        int ncfft2 = nfft>>2;
+        if ( m_realTwiddles.size() != ncfft2) {
+            m_realTwiddles.resize(ncfft2);
+            int ncfft= nfft>>1;
+            for (int k=1;k<=ncfft2;++k) 
+                m_realTwiddles[k-1] = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
+        }
+    }
+
+    void make_twiddles(int nfft,bool inverse)
+    {
+        if ( m_twiddles.size() == nfft) {
+            // reuse the twiddles, conjugate if necessary
+            if (inverse != m_inverse)
+                for (int i=0;i<nfft;++i)
+                    m_twiddles[i] = conj( m_twiddles[i] );
+        }else{
+            m_twiddles.resize(nfft);
+            Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+            for (int i=0;i<nfft;++i)
+                m_twiddles[i] = exp( Complex(0,i*phinc) );
+        }
+        m_inverse = inverse;
+    }
+
+    void factorize(int nfft)
+    {
+        if (m_stageRadix.size()==0 || m_stageRadix[0] * m_stageRemainder[0] != nfft)
+        {
+            m_stageRadix.resize(0);
+            m_stageRemainder.resize(0);
+            //factorize
+            //start factoring out 4's, then 2's, then 3,5,7,9,...
+            int n= nfft;
+            int p=4;
+            do {
+                while (n % p) {
+                    switch (p) {
+                        case 4: p = 2; break;
+                        case 2: p = 3; break;
+                        default: p += 2; break;
+                    }
+                    if (p*p>n)
+                        p=n;// no more factors
+                }
+                n /= p;
+                m_stageRadix.push_back(p);
+                m_stageRemainder.push_back(n);
+            }while(n>1);
+        }
+        m_nfft = nfft;
+    }
+
+    void scale(Complex *dst,Scalar s) 
     {
-      if (m_inverse) {
-        Scalar s = 1./m_nfft;
         for (int k=0;k<m_nfft;++k)
-          dst[k] *= s;
-      }
+            dst[k] *= s;
     }
 
     private:
@@ -349,6 +372,7 @@ namespace Eigen {
     int m_nfft;
     bool m_inverse;
     std::vector<Complex> m_twiddles;
+    std::vector<Complex> m_realTwiddles;
     std::vector<int> m_stageRadix;
     std::vector<int> m_stageRemainder;
   };
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 13e98ba77..41c7fed7b 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -41,6 +41,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
     {
         long double totalpower=0;
         long double difpower=0;
+        cerr <<"idx\ttruth\t\tvalue\n";
         for (size_t k0=0;k0<fftbuf.size();++k0) {
             complex<long double> acc = 0;
             long double phinc = -2.*k0* M_PIl / timebuf.size();
@@ -51,7 +52,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
             complex<long double> x = promote(fftbuf[k0]); 
             complex<long double> dif = acc - x;
             difpower += norm(dif);
-            cerr << k0 << ":" << acc << " " <<  x << endl;
+            cerr << k0 << "\t" << acc << "\t" <<  x << endl;
         }
         cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
         return sqrt(difpower/totalpower);
@@ -108,6 +109,7 @@ void test_complex(int nfft)
 
 void test_FFT()
 {
+#if 0
   CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
   CALL_SUBTEST( test_complex<float>(1024) ); CALL_SUBTEST( test_complex<double>(1024) ); CALL_SUBTEST( test_complex<long double>(1024) );
   CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
@@ -115,9 +117,11 @@ void test_FFT()
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
-/*
+#endif
+
+#if 1
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
   CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
-  */
+#endif
 }

From 326ea773908c2d7e46101085af8f72d20b3f8cbc Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 23 May 2009 22:50:07 -0400
Subject: [PATCH 07/18] added FFT inverse complex-to-scalar interface (not yet
 optimized)

---
 bench/benchFFT.cpp                            | 73 ++++++++++++++-----
 unsupported/Eigen/src/FFT/simple_fft_traits.h | 31 ++++----
 unsupported/test/FFT.cpp                      | 21 ++++--
 3 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index 041576b75..84cc49fe3 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -31,34 +31,67 @@
 using namespace Eigen;
 using namespace std;
 
-#ifndef NFFT
-#define NFFT 1024
-#endif
+
+template <typename T>
+string nameof();
+
+template <> string nameof<float>() {return "float";}
+template <> string nameof<double>() {return "double";}
+template <> string nameof<long double>() {return "long double";}
 
 #ifndef TYPE
 #define TYPE float
 #endif
 
-#ifndef NITS 
-#define NITS (10000000/NFFT)
+#ifndef NFFT
+#define NFFT 1024
+#endif
+#ifndef NDATA
+#define NDATA 1000000
 #endif
 
-int main() 
+using namespace Eigen;
+
+template <typename T>
+void bench(int nfft)
 {
-  vector<complex<TYPE> > inbuf(NFFT);
-  vector<complex<TYPE> > outbuf(NFFT);
-  Eigen::FFT<TYPE> fft;
+    typedef typename NumTraits<T>::Real Scalar;
+    typedef typename std::complex<Scalar> Complex;
+    int nits = NDATA/nfft;
+    vector<T> inbuf(nfft);
+    vector<Complex > outbuf(nfft);
+    FFT< Scalar > fft;
 
-  fft.fwd( outbuf , inbuf);
+    fft.fwd( outbuf , inbuf);
 
-  BenchTimer timer;
-  timer.reset();
-  for (int k=0;k<8;++k) {
-      timer.start();
-      for(int i = 0; i < NITS; i++)
-          fft.fwd( outbuf , inbuf);
-      timer.stop();
-  }
-  double mflops = 5.*NFFT*log2((double)NFFT) / (1e6 * timer.value() / (double)NITS );
-  cout << "NFFT=" << NFFT << "  " << (double(1e-6*NFFT*NITS)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
+    BenchTimer timer;
+    timer.reset();
+    for (int k=0;k<8;++k) {
+        timer.start();
+        for(int i = 0; i < nits; i++)
+            fft.fwd( outbuf , inbuf);
+        timer.stop();
+    }
+
+    cout << nameof<Scalar>() << " ";
+    double mflops = 5.*nfft*log2((double)nfft) / (1e6 * timer.value() / (double)nits );
+    if ( NumTraits<T>::IsComplex ) {
+        cout << "complex";
+    }else{
+        cout << "real   ";
+        mflops /= 2;
+    }
+
+    cout << " NFFT=" << nfft << "  " << (double(1e-6*nfft*nits)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
+}
+
+int main(int argc,char ** argv)
+{
+    bench<complex<float> >(NFFT);
+    bench<float>(NFFT);
+    bench<complex<double> >(NFFT);
+    bench<double>(NFFT);
+    bench<complex<long double> >(NFFT);
+    bench<long double>(NFFT);
+    return 0;
 }
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index f7dd2b9cf..1e2be8f79 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -54,28 +54,14 @@ namespace Eigen {
             work(0, dst, src, 1,1);
         }else{
             int ncfft = nfft>>1;
+            int ncfft2 = nfft>>2;
             // use optimized mode for even real
             fwd( dst, reinterpret_cast<const Complex*> (src),ncfft);
             make_real_twiddles(nfft);
-            std::cerr << "dst[0] = " << dst[0] << "\n";
             Complex dc = dst[0].real() +  dst[0].imag();
             Complex nyquist = dst[0].real() -  dst[0].imag();
             int k;
-#if 0
-            using namespace std;
-            cerr << "desired:\n";
-            for ( k=1;k <= (ncfft>>1) ; ++k ) {
-                Complex x = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
-                cerr << k << " " << x << "angle(x):" << arg(x) << "\n";
-            }
-            dc=0;
-            cerr << "twiddles:\n";
-            for (k=0;k<ncfft;++k) {
-                Complex x = m_twiddles[k];
-                cerr << k << " " << x << "angle(-x):" << arg(-x) << "\n";
-            }
-#endif
-            for ( k=1;k <= (ncfft>>1) ; ++k ) {
+            for ( k=1;k <= ncfft2 ; ++k ) {
                 Complex fpk = dst[k];
                 Complex fpnk = conj(dst[ncfft-k]);
                 Complex f1k = fpk + fpnk;
@@ -87,7 +73,6 @@ namespace Eigen {
                 dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
             }
  
-
             // place conjugate-symmetric half at the end for completeness
             // TODO: make this configurable ( opt-out )
             for ( k=1;k < ncfft ; ++k )
@@ -98,6 +83,16 @@ namespace Eigen {
         }
     }
 
+    // half-complex to scalar
+    void inv( Scalar * dst,const Complex * src,int nfft) 
+    {
+        // TODO add optimized version for even numbers
+        std::vector<Complex> tmp(nfft);
+        inv(&tmp[0],src,nfft);
+        for (int k=0;k<nfft;++k)
+            dst[k] = tmp[k].real();
+    }
+
     void inv(Complex * dst,const Complex  *src,int nfft)
     {
         prepare(nfft,true);
@@ -156,7 +151,7 @@ namespace Eigen {
                         default: p += 2; break;
                     }
                     if (p*p>n)
-                        p=n;// no more factors
+                        p=n;// impossible to have a factor > sqrt(n)
                 }
                 n /= p;
                 m_stageRadix.push_back(p);
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 41c7fed7b..75c33277d 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -28,6 +28,10 @@
 
 using namespace std;
 
+float norm(float x) {return x*x;}
+double norm(double x) {return x*x;}
+long double norm(long double x) {return x*x;}
+
 template < typename T>
 complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
 
@@ -83,7 +87,11 @@ void test_scalar(int nfft)
     for (int k=0;k<nfft;++k)
         inbuf[k]= (T)(rand()/(double)RAND_MAX - .5);
     fft.fwd( outbuf,inbuf);
-    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+
+    vector<Scalar> buf3;
+    fft.inv( buf3 , outbuf);
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
 }
 
 template <class T>
@@ -100,18 +108,18 @@ void test_complex(int nfft)
         inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
     fft.fwd( outbuf , inbuf);
 
-    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
 
     fft.inv( buf3 , outbuf);
 
-    VERIFY( dif_rmse(inbuf,buf3) < 1e-5 );// gross check
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
 }
 
 void test_FFT()
 {
-#if 0
+#if 1
   CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
-  CALL_SUBTEST( test_complex<float>(1024) ); CALL_SUBTEST( test_complex<double>(1024) ); CALL_SUBTEST( test_complex<long double>(1024) );
+  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); CALL_SUBTEST( test_complex<long double>(256) );
   CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
   CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
@@ -120,8 +128,9 @@ void test_FFT()
 #endif
 
 #if 1
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
-  CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
+  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 #endif
 }

From 210092d16c57ec2fd2f8f515151de284e8a737e3 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Mon, 25 May 2009 20:35:24 -0400
Subject: [PATCH 08/18] changed name from simple_fft_traits to ei_kissfft_impl

---
 bench/benchFFT.cpp                            |  2 +-
 unsupported/Eigen/{FFT.h => FFT}              | 18 ++++++------
 ...{simple_fft_traits.h => ei_kissfft_impl.h} | 29 +++++++++++++++++--
 unsupported/test/FFT.cpp                      |  3 +-
 4 files changed, 37 insertions(+), 15 deletions(-)
 rename unsupported/Eigen/{FFT.h => FFT} (86%)
 rename unsupported/Eigen/src/FFT/{simple_fft_traits.h => ei_kissfft_impl.h} (95%)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index 84cc49fe3..ffa4ffffc 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -26,7 +26,7 @@
 #include <vector>
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>
-#include <unsupported/Eigen/FFT.h>
+#include <unsupported/Eigen/FFT>
 
 using namespace Eigen;
 using namespace std;
diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT
similarity index 86%
rename from unsupported/Eigen/FFT.h
rename to unsupported/Eigen/FFT
index c466423b7..3d852f5a2 100644
--- a/unsupported/Eigen/FFT.h
+++ b/unsupported/Eigen/FFT
@@ -25,28 +25,28 @@
 #ifndef EIGEN_FFT_H
 #define EIGEN_FFT_H
 
-// simple_fft_traits:  small, free, reasonably efficient default, derived from kissfft
-#include "src/FFT/simple_fft_traits.h"
-#define DEFAULT_FFT_TRAITS simple_fft_traits
+// ei_kissfft_impl:  small, free, reasonably efficient default, derived from kissfft
+#include "src/FFT/ei_kissfft_impl.h"
+#define DEFAULT_FFT_IMPL ei_kissfft_impl
 
 // FFTW: faster, GPL-not LGPL, bigger code size
 #ifdef FFTW_PATIENT  // definition of FFTW_PATIENT indicates the caller has included fftw3.h, we can use FFTW routines
 // TODO 
-// #include "src/FFT/fftw_traits.h"
-// #define DEFAULT_FFT_TRAITS fftw_traits
+// #include "src/FFT/ei_fftw_impl.h"
+// #define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
 // intel Math Kernel Library: fastest, commerical
 #ifdef _MKL_DFTI_H_ // mkl_dfti.h has been included, we can use MKL FFT routines
 // TODO 
-// #include "src/FFT/imkl_traits.h"
-// #define DEFAULT_FFT_TRAITS imkl_traits
+// #include "src/FFT/ei_imkl_impl.h"
+// #define DEFAULT_FFT_IMPL ei_imkl_impl
 #endif
 
 namespace Eigen {
 
 template <typename _Scalar,
-         typename _Traits=DEFAULT_FFT_TRAITS<_Scalar> 
+         typename _Traits=DEFAULT_FFT_IMPL<_Scalar> 
          >
 class FFT
 {
@@ -90,6 +90,6 @@ class FFT
   private:
     traits_type m_traits;
 };
-#undef DEFAULT_FFT_TRAITS
+#undef DEFAULT_FFT_IMPL
 }
 #endif
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
similarity index 95%
rename from unsupported/Eigen/src/FFT/simple_fft_traits.h
rename to unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 1e2be8f79..ce2c9f16e 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -24,16 +24,15 @@
 
 #include <complex>
 #include <vector>
-#include <iostream>
 
 namespace Eigen {
 
   template <typename _Scalar>
-  struct simple_fft_traits
+  struct ei_kissfft_impl
   {
     typedef _Scalar Scalar;
     typedef std::complex<Scalar> Complex;
-    simple_fft_traits() : m_nfft(0) {} 
+    ei_kissfft_impl() : m_nfft(0) {} 
 
     template <typename _Src>
     void fwd( Complex * dst,const _Src *src,int nfft)
@@ -370,5 +369,29 @@ namespace Eigen {
     std::vector<Complex> m_realTwiddles;
     std::vector<int> m_stageRadix;
     std::vector<int> m_stageRemainder;
+/*
+    enum {FORWARD,INVERSE,REAL,COMPLEX};
+
+    struct PlanKey
+    {
+        PlanKey(int nfft,bool isinverse,bool iscomplex)
+        {
+            _key = (nfft<<2) | (isinverse<<1) | iscomplex;
+        }
+
+        bool operator<(const PlanKey & other) const
+        {
+            return this->_key < other._key;
+        }
+        int _key;
+    };
+
+    struct PlanData
+    {
+        std::vector<Complex> m_twiddles;
+    };
+
+    std::map<PlanKey,
+*/
   };
 }
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 75c33277d..daf397790 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -23,8 +23,7 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
 #include "main.h"
-#include <unsupported/Eigen/FFT.h>
-
+#include <unsupported/Eigen/FFT>
 
 using namespace std;
 

From 03ed6f9bfb63879d475f5bb8ea46cff96063d010 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Mon, 25 May 2009 23:06:49 -0400
Subject: [PATCH 09/18] refactored ei_kissfft_impl to maintain a cache of cpx
 fft plans

---
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 612 ++++++++++----------
 unsupported/test/FFT.cpp                    |   7 +-
 2 files changed, 320 insertions(+), 299 deletions(-)

diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index ce2c9f16e..3580e6c61 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -24,21 +24,279 @@
 
 #include <complex>
 #include <vector>
+#include <map>
 
 namespace Eigen {
 
+    template <typename _Scalar>
+    struct ei_kiss_cpx_fft
+    {
+        typedef  _Scalar Scalar;
+        typedef  std::complex<Scalar> Complex;
+        std::vector<Complex> m_twiddles;
+        std::vector<int> m_stageRadix;
+        std::vector<int> m_stageRemainder;
+        bool m_inverse;
+
+        ei_kiss_cpx_fft() { }
+
+        void make_twiddles(int nfft,bool inverse)
+        {
+            m_inverse = inverse;
+            m_twiddles.resize(nfft);
+            Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+            for (int i=0;i<nfft;++i)
+                m_twiddles[i] = exp( Complex(0,i*phinc) );
+        }
+
+        void invert()
+        {
+            m_inverse = !m_inverse;
+            for ( size_t i=0;i<m_twiddles.size() ;++i)
+                m_twiddles[i] = conj( m_twiddles[i] );
+        }
+
+        void factorize(int nfft)
+        {
+            if (m_stageRadix.size()==0 || m_stageRadix[0] * m_stageRemainder[0] != nfft)
+            {
+                m_stageRadix.resize(0);
+                m_stageRemainder.resize(0);
+                //factorize
+                //start factoring out 4's, then 2's, then 3,5,7,9,...
+                int n= nfft;
+                int p=4;
+                do {
+                    while (n % p) {
+                        switch (p) {
+                            case 4: p = 2; break;
+                            case 2: p = 3; break;
+                            default: p += 2; break;
+                        }
+                        if (p*p>n)
+                            p=n;// impossible to have a factor > sqrt(n)
+                    }
+                    n /= p;
+                    m_stageRadix.push_back(p);
+                    m_stageRemainder.push_back(n);
+                }while(n>1);
+            }
+        }
+
+        template <typename _Src>
+            void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+            {
+                int p = m_stageRadix[stage];
+                int m = m_stageRemainder[stage];
+                Complex * Fout_beg = xout;
+                Complex * Fout_end = xout + p*m;
+
+                if (m>1) {
+                    do{
+                        // recursive call:
+                        // DFT of size m*p performed by doing
+                        // p instances of smaller DFTs of size m, 
+                        // each one takes a decimated version of the input
+                        work(stage+1, xout , xin, fstride*p,in_stride);
+                        xin += fstride*in_stride;
+                    }while( (xout += m) != Fout_end );
+                }else{
+                    do{
+                        *xout = *xin;
+                        xin += fstride*in_stride;
+                    }while(++xout != Fout_end );
+                }
+                xout=Fout_beg;
+
+                // recombine the p smaller DFTs 
+                switch (p) {
+                    case 2: bfly2(xout,fstride,m); break;
+                    case 3: bfly3(xout,fstride,m); break;
+                    case 4: bfly4(xout,fstride,m); break;
+                    case 5: bfly5(xout,fstride,m); break;
+                    default: bfly_generic(xout,fstride,m,p); break;
+                }
+            }
+
+        void bfly2( Complex * Fout, const size_t fstride, int m)
+        {
+            for (int k=0;k<m;++k) {
+                Complex t = Fout[m+k] * m_twiddles[k*fstride];
+                Fout[m+k] = Fout[k] - t;
+                Fout[k] += t;
+            }
+        }
+
+        void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+        {
+            Complex scratch[6];
+            int negative_if_inverse = m_inverse * -2 +1;
+            for (size_t k=0;k<m;++k) {
+                scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+                scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+                scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+                scratch[5] = Fout[k] - scratch[1];
+
+                Fout[k] += scratch[1];
+                scratch[3] = scratch[0] + scratch[2];
+                scratch[4] = scratch[0] - scratch[2];
+                scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+                Fout[k+2*m]  = Fout[k] - scratch[3];
+                Fout[k] += scratch[3];
+                Fout[k+m] = scratch[5] + scratch[4];
+                Fout[k+3*m] = scratch[5] - scratch[4];
+            }
+        }
+
+        void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+        {
+            size_t k=m;
+            const size_t m2 = 2*m;
+            Complex *tw1,*tw2;
+            Complex scratch[5];
+            Complex epi3;
+            epi3 = m_twiddles[fstride*m];
+
+            tw1=tw2=&m_twiddles[0];
+
+            do{
+                scratch[1]=Fout[m] * *tw1;
+                scratch[2]=Fout[m2] * *tw2;
+
+                scratch[3]=scratch[1]+scratch[2];
+                scratch[0]=scratch[1]-scratch[2];
+                tw1 += fstride;
+                tw2 += fstride*2;
+                Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
+                scratch[0] *= epi3.imag();
+                *Fout += scratch[3];
+                Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+                Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+                ++Fout;
+            }while(--k);
+        }
+
+        void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+        {
+            Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+            size_t u;
+            Complex scratch[13];
+            Complex * twiddles = &m_twiddles[0];
+            Complex *tw;
+            Complex ya,yb;
+            ya = twiddles[fstride*m];
+            yb = twiddles[fstride*2*m];
+
+            Fout0=Fout;
+            Fout1=Fout0+m;
+            Fout2=Fout0+2*m;
+            Fout3=Fout0+3*m;
+            Fout4=Fout0+4*m;
+
+            tw=twiddles;
+            for ( u=0; u<m; ++u ) {
+                scratch[0] = *Fout0;
+
+                scratch[1]  = *Fout1 * tw[u*fstride];
+                scratch[2]  = *Fout2 * tw[2*u*fstride];
+                scratch[3]  = *Fout3 * tw[3*u*fstride];
+                scratch[4]  = *Fout4 * tw[4*u*fstride];
+
+                scratch[7] = scratch[1] + scratch[4];
+                scratch[10] = scratch[1] - scratch[4];
+                scratch[8] = scratch[2] + scratch[3];
+                scratch[9] = scratch[2] - scratch[3];
+
+                *Fout0 +=  scratch[7];
+                *Fout0 +=  scratch[8];
+
+                scratch[5] = scratch[0] + Complex(
+                        (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+                        (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
+                        );
+
+                scratch[6] = Complex(
+                        (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+                        -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+                        );
+
+                *Fout1 = scratch[5] - scratch[6];
+                *Fout4 = scratch[5] + scratch[6];
+
+                scratch[11] = scratch[0] +
+                    Complex(
+                            (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+                            (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+                           );
+
+                scratch[12] = Complex(
+                        -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+                        (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+                        );
+
+                *Fout2=scratch[11]+scratch[12];
+                *Fout3=scratch[11]-scratch[12];
+
+                ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+            }
+        }
+
+        /* perform the butterfly for one stage of a mixed radix FFT */
+        void bfly_generic(
+                Complex * Fout,
+                const size_t fstride,
+                int m,
+                int p
+                )
+        {
+            int u,k,q1,q;
+            Complex * twiddles = &m_twiddles[0];
+            Complex t;
+            int Norig = m_twiddles.size();
+            Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+
+            for ( u=0; u<m; ++u ) {
+                k=u;
+                for ( q1=0 ; q1<p ; ++q1 ) {
+                    scratchbuf[q1] = Fout[ k  ];
+                    k += m;
+                }
+
+                k=u;
+                for ( q1=0 ; q1<p ; ++q1 ) {
+                    int twidx=0;
+                    Fout[ k ] = scratchbuf[0];
+                    for (q=1;q<p;++q ) {
+                        twidx += fstride * k;
+                        if (twidx>=Norig) twidx-=Norig;
+                        t=scratchbuf[q] * twiddles[twidx];
+                        Fout[ k ] += t;
+                    }
+                    k += m;
+                }
+            }
+        }
+    };
+
+
   template <typename _Scalar>
   struct ei_kissfft_impl
   {
     typedef _Scalar Scalar;
     typedef std::complex<Scalar> Complex;
-    ei_kissfft_impl() : m_nfft(0) {} 
+    ei_kissfft_impl() {} 
+
+    void clear() 
+    {
+        m_plans.clear();
+        m_realTwiddles.clear();
+    }
 
     template <typename _Src>
     void fwd( Complex * dst,const _Src *src,int nfft)
     {
-        prepare(nfft,false);
-        work(0, dst, src, 1,1);
+        get_plan(nfft,false).work(0, dst, src, 1,1);
     }
 
     // real-to-complex forward FFT
@@ -47,16 +305,16 @@ namespace Eigen {
     // then fill in the conjugate symmetric half
     void fwd( Complex * dst,const Scalar * src,int nfft) 
     {
-        if ( nfft&1 ) {
+        if ( nfft&3  ) {
             // use generic mode for odd
-            prepare(nfft,false);
-            work(0, dst, src, 1,1);
+            get_plan(nfft,false).work(0, dst, src, 1,1);
         }else{
             int ncfft = nfft>>1;
             int ncfft2 = nfft>>2;
+            Complex * rtw = real_twiddles(ncfft2);
+
             // use optimized mode for even real
-            fwd( dst, reinterpret_cast<const Complex*> (src),ncfft);
-            make_real_twiddles(nfft);
+            fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
             Complex dc = dst[0].real() +  dst[0].imag();
             Complex nyquist = dst[0].real() -  dst[0].imag();
             int k;
@@ -65,8 +323,7 @@ namespace Eigen {
                 Complex fpnk = conj(dst[ncfft-k]);
                 Complex f1k = fpk + fpnk;
                 Complex f2k = fpk - fpnk;
-                //Complex tw = f2k * exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
-                Complex tw= f2k * m_realTwiddles[k-1];
+                Complex tw= f2k * rtw[k-1];
 
                 dst[k] =  (f1k + tw) * Scalar(.5);
                 dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
@@ -94,304 +351,67 @@ namespace Eigen {
 
     void inv(Complex * dst,const Complex  *src,int nfft)
     {
-        prepare(nfft,true);
-        work(0, dst, src, 1,1);
-        scale(dst, Scalar(1)/m_nfft );
-    }
-
-    void prepare(int nfft,bool inverse)
-    {
-        make_twiddles(nfft,inverse);
-        factorize(nfft);
-    }
-
-    void make_real_twiddles(int nfft)
-    {
-        int ncfft2 = nfft>>2;
-        if ( m_realTwiddles.size() != ncfft2) {
-            m_realTwiddles.resize(ncfft2);
-            int ncfft= nfft>>1;
-            for (int k=1;k<=ncfft2;++k) 
-                m_realTwiddles[k-1] = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
-        }
-    }
-
-    void make_twiddles(int nfft,bool inverse)
-    {
-        if ( m_twiddles.size() == nfft) {
-            // reuse the twiddles, conjugate if necessary
-            if (inverse != m_inverse)
-                for (int i=0;i<nfft;++i)
-                    m_twiddles[i] = conj( m_twiddles[i] );
-        }else{
-            m_twiddles.resize(nfft);
-            Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-            for (int i=0;i<nfft;++i)
-                m_twiddles[i] = exp( Complex(0,i*phinc) );
-        }
-        m_inverse = inverse;
-    }
-
-    void factorize(int nfft)
-    {
-        if (m_stageRadix.size()==0 || m_stageRadix[0] * m_stageRemainder[0] != nfft)
-        {
-            m_stageRadix.resize(0);
-            m_stageRemainder.resize(0);
-            //factorize
-            //start factoring out 4's, then 2's, then 3,5,7,9,...
-            int n= nfft;
-            int p=4;
-            do {
-                while (n % p) {
-                    switch (p) {
-                        case 4: p = 2; break;
-                        case 2: p = 3; break;
-                        default: p += 2; break;
-                    }
-                    if (p*p>n)
-                        p=n;// impossible to have a factor > sqrt(n)
-                }
-                n /= p;
-                m_stageRadix.push_back(p);
-                m_stageRemainder.push_back(n);
-            }while(n>1);
-        }
-        m_nfft = nfft;
-    }
-
-    void scale(Complex *dst,Scalar s) 
-    {
-        for (int k=0;k<m_nfft;++k)
-            dst[k] *= s;
+        get_plan(nfft,true).work(0, dst, src, 1,1);
+        scale(dst, nfft, Scalar(1)/nfft );
     }
 
     private:
 
-    template <typename _Src>
-    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+    typedef ei_kiss_cpx_fft<Scalar> PlanData;
+
+    typedef std::map<int,PlanData> PlanMap;
+    PlanMap m_plans;
+    std::map<int, std::vector<Complex> > m_realTwiddles;
+
+    int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
+
+    PlanData & get_plan(int nfft,bool inverse)
     {
-      int p = m_stageRadix[stage];
-      int m = m_stageRemainder[stage];
-      Complex * Fout_beg = xout;
-      Complex * Fout_end = xout + p*m;
-
-      if (m>1) {
-        do{
-          // recursive call:
-          // DFT of size m*p performed by doing
-          // p instances of smaller DFTs of size m, 
-          // each one takes a decimated version of the input
-          work(stage+1, xout , xin, fstride*p,in_stride);
-          xin += fstride*in_stride;
-        }while( (xout += m) != Fout_end );
-      }else{
-          do{
-              *xout = *xin;
-              xin += fstride*in_stride;
-          }while(++xout != Fout_end );
-      }
-      xout=Fout_beg;
-
-      // recombine the p smaller DFTs 
-      switch (p) {
-        case 2: bfly2(xout,fstride,m); break;
-        case 3: bfly3(xout,fstride,m); break;
-        case 4: bfly4(xout,fstride,m); break;
-        case 5: bfly5(xout,fstride,m); break;
-        default: bfly_generic(xout,fstride,m,p); break;
-      }
-    }
-
-    void bfly2( Complex * Fout, const size_t fstride, int m)
-    {
-      for (int k=0;k<m;++k) {
-        Complex t = Fout[m+k] * m_twiddles[k*fstride];
-        Fout[m+k] = Fout[k] - t;
-        Fout[k] += t;
-      }
-    }
-
-    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex scratch[6];
-      int negative_if_inverse = m_inverse * -2 +1;
-      for (size_t k=0;k<m;++k) {
-        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
-        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
-        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
-        scratch[5] = Fout[k] - scratch[1];
-
-        Fout[k] += scratch[1];
-        scratch[3] = scratch[0] + scratch[2];
-        scratch[4] = scratch[0] - scratch[2];
-        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
-
-        Fout[k+2*m]  = Fout[k] - scratch[3];
-        Fout[k] += scratch[3];
-        Fout[k+m] = scratch[5] + scratch[4];
-        Fout[k+3*m] = scratch[5] - scratch[4];
-      }
-    }
-
-    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      size_t k=m;
-      const size_t m2 = 2*m;
-      Complex *tw1,*tw2;
-      Complex scratch[5];
-      Complex epi3;
-      epi3 = m_twiddles[fstride*m];
-
-      tw1=tw2=&m_twiddles[0];
-
-      do{
-        scratch[1]=Fout[m] * *tw1;
-        scratch[2]=Fout[m2] * *tw2;
-
-        scratch[3]=scratch[1]+scratch[2];
-        scratch[0]=scratch[1]-scratch[2];
-        tw1 += fstride;
-        tw2 += fstride*2;
-        Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
-        scratch[0] *= epi3.imag();
-        *Fout += scratch[3];
-        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
-        ++Fout;
-      }while(--k);
-    }
-
-    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-      size_t u;
-      Complex scratch[13];
-      Complex * twiddles = &m_twiddles[0];
-      Complex *tw;
-      Complex ya,yb;
-      ya = twiddles[fstride*m];
-      yb = twiddles[fstride*2*m];
-
-      Fout0=Fout;
-      Fout1=Fout0+m;
-      Fout2=Fout0+2*m;
-      Fout3=Fout0+3*m;
-      Fout4=Fout0+4*m;
-
-      tw=twiddles;
-      for ( u=0; u<m; ++u ) {
-        scratch[0] = *Fout0;
-
-        scratch[1]  = *Fout1 * tw[u*fstride];
-        scratch[2]  = *Fout2 * tw[2*u*fstride];
-        scratch[3]  = *Fout3 * tw[3*u*fstride];
-        scratch[4]  = *Fout4 * tw[4*u*fstride];
-
-        scratch[7] = scratch[1] + scratch[4];
-        scratch[10] = scratch[1] - scratch[4];
-        scratch[8] = scratch[2] + scratch[3];
-        scratch[9] = scratch[2] - scratch[3];
-
-        *Fout0 +=  scratch[7];
-        *Fout0 +=  scratch[8];
-
-        scratch[5] = scratch[0] + Complex(
-            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
-            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
-            );
-
-        scratch[6] = Complex(
-            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
-            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
-            );
-
-        *Fout1 = scratch[5] - scratch[6];
-        *Fout4 = scratch[5] + scratch[6];
-
-        scratch[11] = scratch[0] +
-          Complex(
-              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
-              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
-              );
-
-        scratch[12] = Complex(
-            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
-            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
-            );
-
-        *Fout2=scratch[11]+scratch[12];
-        *Fout3=scratch[11]-scratch[12];
-
-        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-      }
-    }
-
-    /* perform the butterfly for one stage of a mixed radix FFT */
-    void bfly_generic(
-        Complex * Fout,
-        const size_t fstride,
-        int m,
-        int p
-        )
-    {
-      int u,k,q1,q;
-      Complex * twiddles = &m_twiddles[0];
-      Complex t;
-      int Norig = m_nfft;
-      Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
-
-      for ( u=0; u<m; ++u ) {
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          scratchbuf[q1] = Fout[ k  ];
-          k += m;
+        /* 
+         * for some reason this does not work
+         *
+        typedef typename std::map<int,PlanData>::iterator MapIt;
+        MapIt it;
+        it = m_plans.find( PlanKey(nfft,inverse) );
+        if (it == m_plans.end() ) {
+            // create new entry
+            it = m_plans.insert( make_pair( PlanKey(nfft,inverse) , PlanData() ) );
+            MapIt it2 = m_plans.find( PlanKey(nfft,!inverse) );
+            if (it2 != m_plans.end() ) {
+                it->second = it2.second;
+                it->second.invert();
+            }else{
+                it->second.make_twiddles(nfft,inverse);
+                it->second.factorize(nfft);
+            }
         }
-
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          int twidx=0;
-          Fout[ k ] = scratchbuf[0];
-          for (q=1;q<p;++q ) {
-            twidx += fstride * k;
-            if (twidx>=Norig) twidx-=Norig;
-            t=scratchbuf[q] * twiddles[twidx];
-            Fout[ k ] += t;
-          }
-          k += m;
+        return it->second;
+        */
+        PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
+        if ( pd.m_twiddles.size() == 0 ) {
+            pd.make_twiddles(nfft,inverse);
+            pd.factorize(nfft);
         }
-      }
+        return pd;
     }
 
-    int m_nfft;
-    bool m_inverse;
-    std::vector<Complex> m_twiddles;
-    std::vector<Complex> m_realTwiddles;
-    std::vector<int> m_stageRadix;
-    std::vector<int> m_stageRemainder;
-/*
-    enum {FORWARD,INVERSE,REAL,COMPLEX};
-
-    struct PlanKey
+    Complex * real_twiddles(int ncfft2)
     {
-        PlanKey(int nfft,bool isinverse,bool iscomplex)
-        {
-            _key = (nfft<<2) | (isinverse<<1) | iscomplex;
+        std::vector<Complex> & twidref = m_realTwiddles[ncfft2];// creates new if not there
+        if ( (int)twidref.size() != ncfft2 ) {
+            twidref.resize(ncfft2);
+            int ncfft= ncfft2<<1;
+            Scalar pi =  acos( Scalar(-1) );
+            for (int k=1;k<=ncfft2;++k) 
+                twidref[k-1] = exp( Complex(0,-pi * ((double) (k) / ncfft + .5) ) );
         }
+        return &twidref[0];
+    }
 
-        bool operator<(const PlanKey & other) const
-        {
-            return this->_key < other._key;
-        }
-        int _key;
-    };
-
-    struct PlanData
+    void scale(Complex *dst,int n,Scalar s) 
     {
-        std::vector<Complex> m_twiddles;
-    };
-
-    std::map<PlanKey,
-*/
+        for (int k=0;k<n;++k)
+            dst[k] *= s;
+    }
   };
 }
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index daf397790..32d1393d0 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -44,7 +44,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
     {
         long double totalpower=0;
         long double difpower=0;
-        cerr <<"idx\ttruth\t\tvalue\n";
+        cerr <<"idx\ttruth\t\tvalue\t|dif|=\n";
         for (size_t k0=0;k0<fftbuf.size();++k0) {
             complex<long double> acc = 0;
             long double phinc = -2.*k0* M_PIl / timebuf.size();
@@ -55,7 +55,7 @@ complex<long double>  promote(long double x) { return complex<long double>( x);
             complex<long double> x = promote(fftbuf[k0]); 
             complex<long double> dif = acc - x;
             difpower += norm(dif);
-            cerr << k0 << "\t" << acc << "\t" <<  x << endl;
+            cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(norm(dif)) << endl;
         }
         cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
         return sqrt(difpower/totalpower);
@@ -127,8 +127,9 @@ void test_FFT()
 #endif
 
 #if 1
-  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
+  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); CALL_SUBTEST( test_scalar<long double>(50) );
   CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 #endif

From 09b47332553a79dab30516e6b1d410dea90cf9b7 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Mon, 25 May 2009 23:52:21 -0400
Subject: [PATCH 10/18] added real-optimized inverse FFT (NFFT must be multiple
 of 4)

---
 bench/benchFFT.cpp                          |  30 +-
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 688 ++++++++++----------
 2 files changed, 370 insertions(+), 348 deletions(-)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index ffa4ffffc..14f5063fb 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -53,7 +53,7 @@ template <> string nameof<long double>() {return "long double";}
 using namespace Eigen;
 
 template <typename T>
-void bench(int nfft)
+void bench(int nfft,bool fwd)
 {
     typedef typename NumTraits<T>::Real Scalar;
     typedef typename std::complex<Scalar> Complex;
@@ -69,7 +69,10 @@ void bench(int nfft)
     for (int k=0;k<8;++k) {
         timer.start();
         for(int i = 0; i < nits; i++)
-            fft.fwd( outbuf , inbuf);
+            if (fwd)
+                fft.fwd( outbuf , inbuf);
+            else
+                fft.inv(inbuf,outbuf);
         timer.stop();
     }
 
@@ -82,16 +85,27 @@ void bench(int nfft)
         mflops /= 2;
     }
 
+    if (fwd)
+        cout << " fwd";
+    else
+        cout << " inv";
+
     cout << " NFFT=" << nfft << "  " << (double(1e-6*nfft*nits)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
 }
 
 int main(int argc,char ** argv)
 {
-    bench<complex<float> >(NFFT);
-    bench<float>(NFFT);
-    bench<complex<double> >(NFFT);
-    bench<double>(NFFT);
-    bench<complex<long double> >(NFFT);
-    bench<long double>(NFFT);
+    bench<complex<float> >(NFFT,true);
+    bench<complex<float> >(NFFT,false);
+    bench<float>(NFFT,true);
+    bench<float>(NFFT,false);
+    bench<complex<double> >(NFFT,true);
+    bench<complex<double> >(NFFT,false);
+    bench<double>(NFFT,true);
+    bench<double>(NFFT,false);
+    bench<complex<long double> >(NFFT,true);
+    bench<complex<long double> >(NFFT,false);
+    bench<long double>(NFFT,true);
+    bench<long double>(NFFT,false);
     return 0;
 }
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 3580e6c61..453c7f6da 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -28,390 +28,398 @@
 
 namespace Eigen {
 
-    template <typename _Scalar>
-    struct ei_kiss_cpx_fft
+  template <typename _Scalar>
+  struct ei_kiss_cpx_fft
+  {
+    typedef  _Scalar Scalar;
+    typedef  std::complex<Scalar> Complex;
+    std::vector<Complex> m_twiddles;
+    std::vector<int> m_stageRadix;
+    std::vector<int> m_stageRemainder;
+    bool m_inverse;
+
+    void make_twiddles(int nfft,bool inverse)
     {
-        typedef  _Scalar Scalar;
-        typedef  std::complex<Scalar> Complex;
-        std::vector<Complex> m_twiddles;
-        std::vector<int> m_stageRadix;
-        std::vector<int> m_stageRemainder;
-        bool m_inverse;
+      m_inverse = inverse;
+      m_twiddles.resize(nfft);
+      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+      for (int i=0;i<nfft;++i)
+        m_twiddles[i] = exp( Complex(0,i*phinc) );
+    }
 
-        ei_kiss_cpx_fft() { }
+    void conjugate()
+    {
+      m_inverse = !m_inverse;
+      for ( size_t i=0;i<m_twiddles.size() ;++i)
+        m_twiddles[i] = conj( m_twiddles[i] );
+    }
 
-        void make_twiddles(int nfft,bool inverse)
-        {
-            m_inverse = inverse;
-            m_twiddles.resize(nfft);
-            Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-            for (int i=0;i<nfft;++i)
-                m_twiddles[i] = exp( Complex(0,i*phinc) );
+    void factorize(int nfft)
+    {
+      //start factoring out 4's, then 2's, then 3,5,7,9,...
+      int n= nfft;
+      int p=4;
+      do {
+        while (n % p) {
+          switch (p) {
+            case 4: p = 2; break;
+            case 2: p = 3; break;
+            default: p += 2; break;
+          }
+          if (p*p>n)
+            p=n;// impossible to have a factor > sqrt(n)
+        }
+        n /= p;
+        m_stageRadix.push_back(p);
+        m_stageRemainder.push_back(n);
+      }while(n>1);
+    }
+
+    template <typename _Src>
+    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+    {
+      int p = m_stageRadix[stage];
+      int m = m_stageRemainder[stage];
+      Complex * Fout_beg = xout;
+      Complex * Fout_end = xout + p*m;
+
+      if (m>1) {
+        do{
+          // recursive call:
+          // DFT of size m*p performed by doing
+          // p instances of smaller DFTs of size m, 
+          // each one takes a decimated version of the input
+          work(stage+1, xout , xin, fstride*p,in_stride);
+          xin += fstride*in_stride;
+        }while( (xout += m) != Fout_end );
+      }else{
+        do{
+          *xout = *xin;
+          xin += fstride*in_stride;
+        }while(++xout != Fout_end );
+      }
+      xout=Fout_beg;
+
+      // recombine the p smaller DFTs 
+      switch (p) {
+        case 2: bfly2(xout,fstride,m); break;
+        case 3: bfly3(xout,fstride,m); break;
+        case 4: bfly4(xout,fstride,m); break;
+        case 5: bfly5(xout,fstride,m); break;
+        default: bfly_generic(xout,fstride,m,p); break;
+      }
+    }
+
+    void bfly2( Complex * Fout, const size_t fstride, int m)
+    {
+      for (int k=0;k<m;++k) {
+        Complex t = Fout[m+k] * m_twiddles[k*fstride];
+        Fout[m+k] = Fout[k] - t;
+        Fout[k] += t;
+      }
+    }
+
+    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex scratch[6];
+      int negative_if_inverse = m_inverse * -2 +1;
+      for (size_t k=0;k<m;++k) {
+        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+        scratch[5] = Fout[k] - scratch[1];
+
+        Fout[k] += scratch[1];
+        scratch[3] = scratch[0] + scratch[2];
+        scratch[4] = scratch[0] - scratch[2];
+        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+        Fout[k+2*m]  = Fout[k] - scratch[3];
+        Fout[k] += scratch[3];
+        Fout[k+m] = scratch[5] + scratch[4];
+        Fout[k+3*m] = scratch[5] - scratch[4];
+      }
+    }
+
+    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      size_t k=m;
+      const size_t m2 = 2*m;
+      Complex *tw1,*tw2;
+      Complex scratch[5];
+      Complex epi3;
+      epi3 = m_twiddles[fstride*m];
+
+      tw1=tw2=&m_twiddles[0];
+
+      do{
+        scratch[1]=Fout[m] * *tw1;
+        scratch[2]=Fout[m2] * *tw2;
+
+        scratch[3]=scratch[1]+scratch[2];
+        scratch[0]=scratch[1]-scratch[2];
+        tw1 += fstride;
+        tw2 += fstride*2;
+        Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
+        scratch[0] *= epi3.imag();
+        *Fout += scratch[3];
+        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+        ++Fout;
+      }while(--k);
+    }
+
+    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+    {
+      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+      size_t u;
+      Complex scratch[13];
+      Complex * twiddles = &m_twiddles[0];
+      Complex *tw;
+      Complex ya,yb;
+      ya = twiddles[fstride*m];
+      yb = twiddles[fstride*2*m];
+
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      tw=twiddles;
+      for ( u=0; u<m; ++u ) {
+        scratch[0] = *Fout0;
+
+        scratch[1]  = *Fout1 * tw[u*fstride];
+        scratch[2]  = *Fout2 * tw[2*u*fstride];
+        scratch[3]  = *Fout3 * tw[3*u*fstride];
+        scratch[4]  = *Fout4 * tw[4*u*fstride];
+
+        scratch[7] = scratch[1] + scratch[4];
+        scratch[10] = scratch[1] - scratch[4];
+        scratch[8] = scratch[2] + scratch[3];
+        scratch[9] = scratch[2] - scratch[3];
+
+        *Fout0 +=  scratch[7];
+        *Fout0 +=  scratch[8];
+
+        scratch[5] = scratch[0] + Complex(
+            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
+            );
+
+        scratch[6] = Complex(
+            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+            );
+
+        *Fout1 = scratch[5] - scratch[6];
+        *Fout4 = scratch[5] + scratch[6];
+
+        scratch[11] = scratch[0] +
+          Complex(
+              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+              );
+
+        scratch[12] = Complex(
+            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+            );
+
+        *Fout2=scratch[11]+scratch[12];
+        *Fout3=scratch[11]-scratch[12];
+
+        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+    }
+
+    /* perform the butterfly for one stage of a mixed radix FFT */
+    void bfly_generic(
+        Complex * Fout,
+        const size_t fstride,
+        int m,
+        int p
+        )
+    {
+      int u,k,q1,q;
+      Complex * twiddles = &m_twiddles[0];
+      Complex t;
+      int Norig = m_twiddles.size();
+      Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+
+      for ( u=0; u<m; ++u ) {
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          scratchbuf[q1] = Fout[ k  ];
+          k += m;
         }
 
-        void invert()
-        {
-            m_inverse = !m_inverse;
-            for ( size_t i=0;i<m_twiddles.size() ;++i)
-                m_twiddles[i] = conj( m_twiddles[i] );
+        k=u;
+        for ( q1=0 ; q1<p ; ++q1 ) {
+          int twidx=0;
+          Fout[ k ] = scratchbuf[0];
+          for (q=1;q<p;++q ) {
+            twidx += fstride * k;
+            if (twidx>=Norig) twidx-=Norig;
+            t=scratchbuf[q] * twiddles[twidx];
+            Fout[ k ] += t;
+          }
+          k += m;
         }
-
-        void factorize(int nfft)
-        {
-            if (m_stageRadix.size()==0 || m_stageRadix[0] * m_stageRemainder[0] != nfft)
-            {
-                m_stageRadix.resize(0);
-                m_stageRemainder.resize(0);
-                //factorize
-                //start factoring out 4's, then 2's, then 3,5,7,9,...
-                int n= nfft;
-                int p=4;
-                do {
-                    while (n % p) {
-                        switch (p) {
-                            case 4: p = 2; break;
-                            case 2: p = 3; break;
-                            default: p += 2; break;
-                        }
-                        if (p*p>n)
-                            p=n;// impossible to have a factor > sqrt(n)
-                    }
-                    n /= p;
-                    m_stageRadix.push_back(p);
-                    m_stageRemainder.push_back(n);
-                }while(n>1);
-            }
-        }
-
-        template <typename _Src>
-            void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
-            {
-                int p = m_stageRadix[stage];
-                int m = m_stageRemainder[stage];
-                Complex * Fout_beg = xout;
-                Complex * Fout_end = xout + p*m;
-
-                if (m>1) {
-                    do{
-                        // recursive call:
-                        // DFT of size m*p performed by doing
-                        // p instances of smaller DFTs of size m, 
-                        // each one takes a decimated version of the input
-                        work(stage+1, xout , xin, fstride*p,in_stride);
-                        xin += fstride*in_stride;
-                    }while( (xout += m) != Fout_end );
-                }else{
-                    do{
-                        *xout = *xin;
-                        xin += fstride*in_stride;
-                    }while(++xout != Fout_end );
-                }
-                xout=Fout_beg;
-
-                // recombine the p smaller DFTs 
-                switch (p) {
-                    case 2: bfly2(xout,fstride,m); break;
-                    case 3: bfly3(xout,fstride,m); break;
-                    case 4: bfly4(xout,fstride,m); break;
-                    case 5: bfly5(xout,fstride,m); break;
-                    default: bfly_generic(xout,fstride,m,p); break;
-                }
-            }
-
-        void bfly2( Complex * Fout, const size_t fstride, int m)
-        {
-            for (int k=0;k<m;++k) {
-                Complex t = Fout[m+k] * m_twiddles[k*fstride];
-                Fout[m+k] = Fout[k] - t;
-                Fout[k] += t;
-            }
-        }
-
-        void bfly4( Complex * Fout, const size_t fstride, const size_t m)
-        {
-            Complex scratch[6];
-            int negative_if_inverse = m_inverse * -2 +1;
-            for (size_t k=0;k<m;++k) {
-                scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
-                scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
-                scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
-                scratch[5] = Fout[k] - scratch[1];
-
-                Fout[k] += scratch[1];
-                scratch[3] = scratch[0] + scratch[2];
-                scratch[4] = scratch[0] - scratch[2];
-                scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
-
-                Fout[k+2*m]  = Fout[k] - scratch[3];
-                Fout[k] += scratch[3];
-                Fout[k+m] = scratch[5] + scratch[4];
-                Fout[k+3*m] = scratch[5] - scratch[4];
-            }
-        }
-
-        void bfly3( Complex * Fout, const size_t fstride, const size_t m)
-        {
-            size_t k=m;
-            const size_t m2 = 2*m;
-            Complex *tw1,*tw2;
-            Complex scratch[5];
-            Complex epi3;
-            epi3 = m_twiddles[fstride*m];
-
-            tw1=tw2=&m_twiddles[0];
-
-            do{
-                scratch[1]=Fout[m] * *tw1;
-                scratch[2]=Fout[m2] * *tw2;
-
-                scratch[3]=scratch[1]+scratch[2];
-                scratch[0]=scratch[1]-scratch[2];
-                tw1 += fstride;
-                tw2 += fstride*2;
-                Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
-                scratch[0] *= epi3.imag();
-                *Fout += scratch[3];
-                Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-                Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
-                ++Fout;
-            }while(--k);
-        }
-
-        void bfly5( Complex * Fout, const size_t fstride, const size_t m)
-        {
-            Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-            size_t u;
-            Complex scratch[13];
-            Complex * twiddles = &m_twiddles[0];
-            Complex *tw;
-            Complex ya,yb;
-            ya = twiddles[fstride*m];
-            yb = twiddles[fstride*2*m];
-
-            Fout0=Fout;
-            Fout1=Fout0+m;
-            Fout2=Fout0+2*m;
-            Fout3=Fout0+3*m;
-            Fout4=Fout0+4*m;
-
-            tw=twiddles;
-            for ( u=0; u<m; ++u ) {
-                scratch[0] = *Fout0;
-
-                scratch[1]  = *Fout1 * tw[u*fstride];
-                scratch[2]  = *Fout2 * tw[2*u*fstride];
-                scratch[3]  = *Fout3 * tw[3*u*fstride];
-                scratch[4]  = *Fout4 * tw[4*u*fstride];
-
-                scratch[7] = scratch[1] + scratch[4];
-                scratch[10] = scratch[1] - scratch[4];
-                scratch[8] = scratch[2] + scratch[3];
-                scratch[9] = scratch[2] - scratch[3];
-
-                *Fout0 +=  scratch[7];
-                *Fout0 +=  scratch[8];
-
-                scratch[5] = scratch[0] + Complex(
-                        (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
-                        (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
-                        );
-
-                scratch[6] = Complex(
-                        (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
-                        -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
-                        );
-
-                *Fout1 = scratch[5] - scratch[6];
-                *Fout4 = scratch[5] + scratch[6];
-
-                scratch[11] = scratch[0] +
-                    Complex(
-                            (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
-                            (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
-                           );
-
-                scratch[12] = Complex(
-                        -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
-                        (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
-                        );
-
-                *Fout2=scratch[11]+scratch[12];
-                *Fout3=scratch[11]-scratch[12];
-
-                ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-            }
-        }
-
-        /* perform the butterfly for one stage of a mixed radix FFT */
-        void bfly_generic(
-                Complex * Fout,
-                const size_t fstride,
-                int m,
-                int p
-                )
-        {
-            int u,k,q1,q;
-            Complex * twiddles = &m_twiddles[0];
-            Complex t;
-            int Norig = m_twiddles.size();
-            Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
-
-            for ( u=0; u<m; ++u ) {
-                k=u;
-                for ( q1=0 ; q1<p ; ++q1 ) {
-                    scratchbuf[q1] = Fout[ k  ];
-                    k += m;
-                }
-
-                k=u;
-                for ( q1=0 ; q1<p ; ++q1 ) {
-                    int twidx=0;
-                    Fout[ k ] = scratchbuf[0];
-                    for (q=1;q<p;++q ) {
-                        twidx += fstride * k;
-                        if (twidx>=Norig) twidx-=Norig;
-                        t=scratchbuf[q] * twiddles[twidx];
-                        Fout[ k ] += t;
-                    }
-                    k += m;
-                }
-            }
-        }
-    };
-
+      }
+    }
+  };
 
   template <typename _Scalar>
   struct ei_kissfft_impl
   {
-    typedef _Scalar Scalar;
-    typedef std::complex<Scalar> Complex;
-    ei_kissfft_impl() {} 
+      typedef _Scalar Scalar;
+      typedef std::complex<Scalar> Complex;
 
-    void clear() 
-    {
+      void clear() 
+      {
         m_plans.clear();
         m_realTwiddles.clear();
-    }
+      }
 
-    template <typename _Src>
-    void fwd( Complex * dst,const _Src *src,int nfft)
-    {
+      template <typename _Src>
+      void fwd( Complex * dst,const _Src *src,int nfft)
+      {
         get_plan(nfft,false).work(0, dst, src, 1,1);
-    }
+      }
 
-    // real-to-complex forward FFT
-    // perform two FFTs of src even and src odd
-    // then twiddle to recombine them into the half-spectrum format
-    // then fill in the conjugate symmetric half
-    void fwd( Complex * dst,const Scalar * src,int nfft) 
-    {
+      // real-to-complex forward FFT
+      // perform two FFTs of src even and src odd
+      // then twiddle to recombine them into the half-spectrum format
+      // then fill in the conjugate symmetric half
+      void fwd( Complex * dst,const Scalar * src,int nfft) 
+      {
         if ( nfft&3  ) {
-            // use generic mode for odd
-            get_plan(nfft,false).work(0, dst, src, 1,1);
+          // use generic mode for odd
+          get_plan(nfft,false).work(0, dst, src, 1,1);
         }else{
-            int ncfft = nfft>>1;
-            int ncfft2 = nfft>>2;
-            Complex * rtw = real_twiddles(ncfft2);
+          int ncfft = nfft>>1;
+          int ncfft2 = nfft>>2;
+          Complex * rtw = real_twiddles(ncfft2);
 
-            // use optimized mode for even real
-            fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
-            Complex dc = dst[0].real() +  dst[0].imag();
-            Complex nyquist = dst[0].real() -  dst[0].imag();
-            int k;
-            for ( k=1;k <= ncfft2 ; ++k ) {
-                Complex fpk = dst[k];
-                Complex fpnk = conj(dst[ncfft-k]);
-                Complex f1k = fpk + fpnk;
-                Complex f2k = fpk - fpnk;
-                Complex tw= f2k * rtw[k-1];
+          // use optimized mode for even real
+          fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
+          Complex dc = dst[0].real() +  dst[0].imag();
+          Complex nyquist = dst[0].real() -  dst[0].imag();
+          int k;
+          for ( k=1;k <= ncfft2 ; ++k ) {
+            Complex fpk = dst[k];
+            Complex fpnk = conj(dst[ncfft-k]);
+            Complex f1k = fpk + fpnk;
+            Complex f2k = fpk - fpnk;
+            Complex tw= f2k * rtw[k-1];
+            dst[k] =  (f1k + tw) * Scalar(.5);
+            dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
+          }
 
-                dst[k] =  (f1k + tw) * Scalar(.5);
-                dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
-            }
- 
-            // place conjugate-symmetric half at the end for completeness
-            // TODO: make this configurable ( opt-out )
-            for ( k=1;k < ncfft ; ++k )
-                dst[nfft-k] = conj(dst[k]);
-
-            dst[0] = dc;
-            dst[ncfft] = nyquist;
+          // place conjugate-symmetric half at the end for completeness
+          // TODO: make this configurable ( opt-out )
+          for ( k=1;k < ncfft ; ++k )
+            dst[nfft-k] = conj(dst[k]);
+          dst[0] = dc;
+          dst[ncfft] = nyquist;
         }
-    }
+      }
 
-    // half-complex to scalar
-    void inv( Scalar * dst,const Complex * src,int nfft) 
-    {
-        // TODO add optimized version for even numbers
-        std::vector<Complex> tmp(nfft);
-        inv(&tmp[0],src,nfft);
-        for (int k=0;k<nfft;++k)
-            dst[k] = tmp[k].real();
-    }
-
-    void inv(Complex * dst,const Complex  *src,int nfft)
-    {
+      // inverse complex-to-complex
+      void inv(Complex * dst,const Complex  *src,int nfft)
+      {
         get_plan(nfft,true).work(0, dst, src, 1,1);
         scale(dst, nfft, Scalar(1)/nfft );
-    }
+      }
 
-    private:
+      // half-complex to scalar
+      void inv( Scalar * dst,const Complex * src,int nfft) 
+      {
+        if (nfft&3) {
+          m_scratchBuf.resize(nfft);
+          inv(&m_scratchBuf[0],src,nfft);
+          for (int k=0;k<nfft;++k)
+            dst[k] = m_scratchBuf[k].real();
+        }else{
+          // optimized version for multiple of 4
+          int ncfft = nfft>>1;
+          int ncfft2 = nfft>>2;
+          Complex * rtw = real_twiddles(ncfft2);
+          m_scratchBuf.resize(ncfft);
+          m_scratchBuf[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
+          for (int k = 1; k <= ncfft / 2; ++k) {
+            Complex fk = src[k];
+            Complex fnkc = conj(src[ncfft-k]);
+            Complex fek = fk + fnkc;
+            Complex tmp = fk - fnkc;
+            Complex fok = tmp * conj(rtw[k-1]);
+            m_scratchBuf[k] = fek + fok;
+            m_scratchBuf[ncfft-k] = conj(fek - fok);
+          }
+          scale(&m_scratchBuf[0], ncfft, Scalar(1)/nfft );
+          get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_scratchBuf[0], 1,1);
+        }
+      }
 
-    typedef ei_kiss_cpx_fft<Scalar> PlanData;
+  private:
 
-    typedef std::map<int,PlanData> PlanMap;
-    PlanMap m_plans;
-    std::map<int, std::vector<Complex> > m_realTwiddles;
+      typedef ei_kiss_cpx_fft<Scalar> PlanData;
 
-    int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
+      typedef std::map<int,PlanData> PlanMap;
+      PlanMap m_plans;
+      std::map<int, std::vector<Complex> > m_realTwiddles;
+      std::vector<Complex> m_scratchBuf;
 
-    PlanData & get_plan(int nfft,bool inverse)
-    {
-        /* 
+      int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
+
+      PlanData & get_plan(int nfft,bool inverse)
+      {
+        /*  TODO: figure out why this does not work (g++ 4.3.2)
          * for some reason this does not work
          *
-        typedef typename std::map<int,PlanData>::iterator MapIt;
-        MapIt it;
-        it = m_plans.find( PlanKey(nfft,inverse) );
-        if (it == m_plans.end() ) {
-            // create new entry
-            it = m_plans.insert( make_pair( PlanKey(nfft,inverse) , PlanData() ) );
-            MapIt it2 = m_plans.find( PlanKey(nfft,!inverse) );
-            if (it2 != m_plans.end() ) {
-                it->second = it2.second;
-                it->second.invert();
-            }else{
-                it->second.make_twiddles(nfft,inverse);
-                it->second.factorize(nfft);
-            }
+         PlanMap::iterator it;
+         it = m_plans.find( PlanKey(nfft,inverse) );
+         if (it == m_plans.end() ) {
+        // create new entry
+        it = m_plans.insert( make_pair( PlanKey(nfft,inverse) , PlanData() ) );
+        MapIt it2 = m_plans.find( PlanKey(nfft,!inverse) );
+        if (it2 != m_plans.end() ) {
+        it->second = it2.second;
+        it->second.conjugate();
+        }else{
+        it->second.make_twiddles(nfft,inverse);
+        it->second.factorize(nfft);
+        }
         }
         return it->second;
         */
         PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
         if ( pd.m_twiddles.size() == 0 ) {
-            pd.make_twiddles(nfft,inverse);
-            pd.factorize(nfft);
+          pd.make_twiddles(nfft,inverse);
+          pd.factorize(nfft);
         }
         return pd;
-    }
+      }
 
-    Complex * real_twiddles(int ncfft2)
-    {
+      Complex * real_twiddles(int ncfft2)
+      {
         std::vector<Complex> & twidref = m_realTwiddles[ncfft2];// creates new if not there
         if ( (int)twidref.size() != ncfft2 ) {
-            twidref.resize(ncfft2);
-            int ncfft= ncfft2<<1;
-            Scalar pi =  acos( Scalar(-1) );
-            for (int k=1;k<=ncfft2;++k) 
-                twidref[k-1] = exp( Complex(0,-pi * ((double) (k) / ncfft + .5) ) );
+          twidref.resize(ncfft2);
+          int ncfft= ncfft2<<1;
+          Scalar pi =  acos( Scalar(-1) );
+          for (int k=1;k<=ncfft2;++k) 
+            twidref[k-1] = exp( Complex(0,-pi * ((double) (k) / ncfft + .5) ) );
         }
         return &twidref[0];
-    }
+      }
 
-    void scale(Complex *dst,int n,Scalar s) 
-    {
+      void scale(Complex *dst,int n,Scalar s) 
+      {
         for (int k=0;k<n;++k)
-            dst[k] *= s;
-    }
+          dst[k] *= s;
+      }
   };
 }

From f13e000b454fed0a9b0d71ca1481d6894bacd8eb Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 27 May 2009 21:32:42 -0400
Subject: [PATCH 11/18] various comment changes

---
 unsupported/Eigen/FFT                       |   6 +-
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 472 ++++++++++----------
 2 files changed, 231 insertions(+), 247 deletions(-)

diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 3d852f5a2..31d8c74c5 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -1,5 +1,5 @@
 // This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
+// for linear algebra. 
 //
 // Copyright (C) 2009 Mark Borgerding mark a borgerding net
 //
@@ -29,14 +29,14 @@
 #include "src/FFT/ei_kissfft_impl.h"
 #define DEFAULT_FFT_IMPL ei_kissfft_impl
 
-// FFTW: faster, GPL-not LGPL, bigger code size
+// FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #ifdef FFTW_PATIENT  // definition of FFTW_PATIENT indicates the caller has included fftw3.h, we can use FFTW routines
 // TODO 
 // #include "src/FFT/ei_fftw_impl.h"
 // #define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
-// intel Math Kernel Library: fastest, commerical
+// intel Math Kernel Library: fastest, commerical -- incompatible with Eigen in GPL form
 #ifdef _MKL_DFTI_H_ // mkl_dfti.h has been included, we can use MKL FFT routines
 // TODO 
 // #include "src/FFT/ei_imkl_impl.h"
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 453c7f6da..91fa5ca18 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -1,5 +1,5 @@
 // This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
+// for linear algebra.
 //
 // Copyright (C) 2009 Mark Borgerding mark a borgerding net
 //
@@ -28,252 +28,255 @@
 
 namespace Eigen {
 
+  // This FFT implementation was derived from kissfft http:sourceforge.net/projects/kissfft
+  // Copyright 2003-2009 Mark Borgerding
+
   template <typename _Scalar>
-  struct ei_kiss_cpx_fft
-  {
-    typedef  _Scalar Scalar;
-    typedef  std::complex<Scalar> Complex;
-    std::vector<Complex> m_twiddles;
-    std::vector<int> m_stageRadix;
-    std::vector<int> m_stageRemainder;
-    bool m_inverse;
-
-    void make_twiddles(int nfft,bool inverse)
+    struct ei_kiss_cpx_fft
     {
-      m_inverse = inverse;
-      m_twiddles.resize(nfft);
-      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-      for (int i=0;i<nfft;++i)
-        m_twiddles[i] = exp( Complex(0,i*phinc) );
-    }
+      typedef _Scalar Scalar;
+      typedef std::complex<Scalar> Complex;
+      std::vector<Complex> m_twiddles;
+      std::vector<int> m_stageRadix;
+      std::vector<int> m_stageRemainder;
+      bool m_inverse;
 
-    void conjugate()
-    {
-      m_inverse = !m_inverse;
-      for ( size_t i=0;i<m_twiddles.size() ;++i)
-        m_twiddles[i] = conj( m_twiddles[i] );
-    }
+      void make_twiddles(int nfft,bool inverse)
+      {
+        m_inverse = inverse;
+        m_twiddles.resize(nfft);
+        Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
+        for (int i=0;i<nfft;++i)
+          m_twiddles[i] = exp( Complex(0,i*phinc) );
+      }
 
-    void factorize(int nfft)
-    {
-      //start factoring out 4's, then 2's, then 3,5,7,9,...
-      int n= nfft;
-      int p=4;
-      do {
-        while (n % p) {
-          switch (p) {
-            case 4: p = 2; break;
-            case 2: p = 3; break;
-            default: p += 2; break;
+      void conjugate()
+      {
+        m_inverse = !m_inverse;
+        for ( size_t i=0;i<m_twiddles.size() ;++i)
+          m_twiddles[i] = conj( m_twiddles[i] );
+      }
+
+      void factorize(int nfft)
+      {
+        //start factoring out 4's, then 2's, then 3,5,7,9,...
+        int n= nfft;
+        int p=4;
+        do {
+          while (n % p) {
+            switch (p) {
+              case 4: p = 2; break;
+              case 2: p = 3; break;
+              default: p += 2; break;
+            }
+            if (p*p>n)
+              p=n;// impossible to have a factor > sqrt(n)
+          }
+          n /= p;
+          m_stageRadix.push_back(p);
+          m_stageRemainder.push_back(n);
+        }while(n>1);
+      }
+
+      template <typename _Src>
+        void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+        {
+          int p = m_stageRadix[stage];
+          int m = m_stageRemainder[stage];
+          Complex * Fout_beg = xout;
+          Complex * Fout_end = xout + p*m;
+
+          if (m>1) {
+            do{
+              // recursive call:
+              // DFT of size m*p performed by doing
+              // p instances of smaller DFTs of size m, 
+              // each one takes a decimated version of the input
+              work(stage+1, xout , xin, fstride*p,in_stride);
+              xin += fstride*in_stride;
+            }while( (xout += m) != Fout_end );
+          }else{
+            do{
+              *xout = *xin;
+              xin += fstride*in_stride;
+            }while(++xout != Fout_end );
+          }
+          xout=Fout_beg;
+
+          // recombine the p smaller DFTs 
+          switch (p) {
+            case 2: bfly2(xout,fstride,m); break;
+            case 3: bfly3(xout,fstride,m); break;
+            case 4: bfly4(xout,fstride,m); break;
+            case 5: bfly5(xout,fstride,m); break;
+            default: bfly_generic(xout,fstride,m,p); break;
           }
-          if (p*p>n)
-            p=n;// impossible to have a factor > sqrt(n)
         }
-        n /= p;
-        m_stageRadix.push_back(p);
-        m_stageRemainder.push_back(n);
-      }while(n>1);
-    }
 
-    template <typename _Src>
-    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
-    {
-      int p = m_stageRadix[stage];
-      int m = m_stageRemainder[stage];
-      Complex * Fout_beg = xout;
-      Complex * Fout_end = xout + p*m;
+      void bfly2( Complex * Fout, const size_t fstride, int m)
+      {
+        for (int k=0;k<m;++k) {
+          Complex t = Fout[m+k] * m_twiddles[k*fstride];
+          Fout[m+k] = Fout[k] - t;
+          Fout[k] += t;
+        }
+      }
+
+      void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+      {
+        Complex scratch[6];
+        int negative_if_inverse = m_inverse * -2 +1;
+        for (size_t k=0;k<m;++k) {
+          scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+          scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+          scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+          scratch[5] = Fout[k] - scratch[1];
+
+          Fout[k] += scratch[1];
+          scratch[3] = scratch[0] + scratch[2];
+          scratch[4] = scratch[0] - scratch[2];
+          scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+          Fout[k+2*m]  = Fout[k] - scratch[3];
+          Fout[k] += scratch[3];
+          Fout[k+m] = scratch[5] + scratch[4];
+          Fout[k+3*m] = scratch[5] - scratch[4];
+        }
+      }
+
+      void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+      {
+        size_t k=m;
+        const size_t m2 = 2*m;
+        Complex *tw1,*tw2;
+        Complex scratch[5];
+        Complex epi3;
+        epi3 = m_twiddles[fstride*m];
+
+        tw1=tw2=&m_twiddles[0];
 
-      if (m>1) {
         do{
-          // recursive call:
-          // DFT of size m*p performed by doing
-          // p instances of smaller DFTs of size m, 
-          // each one takes a decimated version of the input
-          work(stage+1, xout , xin, fstride*p,in_stride);
-          xin += fstride*in_stride;
-        }while( (xout += m) != Fout_end );
-      }else{
-        do{
-          *xout = *xin;
-          xin += fstride*in_stride;
-        }while(++xout != Fout_end );
+          scratch[1]=Fout[m] * *tw1;
+          scratch[2]=Fout[m2] * *tw2;
+
+          scratch[3]=scratch[1]+scratch[2];
+          scratch[0]=scratch[1]-scratch[2];
+          tw1 += fstride;
+          tw2 += fstride*2;
+          Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
+          scratch[0] *= epi3.imag();
+          *Fout += scratch[3];
+          Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+          Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+          ++Fout;
+        }while(--k);
       }
-      xout=Fout_beg;
 
-      // recombine the p smaller DFTs 
-      switch (p) {
-        case 2: bfly2(xout,fstride,m); break;
-        case 3: bfly3(xout,fstride,m); break;
-        case 4: bfly4(xout,fstride,m); break;
-        case 5: bfly5(xout,fstride,m); break;
-        default: bfly_generic(xout,fstride,m,p); break;
-      }
-    }
+      void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+      {
+        Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+        size_t u;
+        Complex scratch[13];
+        Complex * twiddles = &m_twiddles[0];
+        Complex *tw;
+        Complex ya,yb;
+        ya = twiddles[fstride*m];
+        yb = twiddles[fstride*2*m];
 
-    void bfly2( Complex * Fout, const size_t fstride, int m)
-    {
-      for (int k=0;k<m;++k) {
-        Complex t = Fout[m+k] * m_twiddles[k*fstride];
-        Fout[m+k] = Fout[k] - t;
-        Fout[k] += t;
-      }
-    }
+        Fout0=Fout;
+        Fout1=Fout0+m;
+        Fout2=Fout0+2*m;
+        Fout3=Fout0+3*m;
+        Fout4=Fout0+4*m;
 
-    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex scratch[6];
-      int negative_if_inverse = m_inverse * -2 +1;
-      for (size_t k=0;k<m;++k) {
-        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
-        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
-        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
-        scratch[5] = Fout[k] - scratch[1];
+        tw=twiddles;
+        for ( u=0; u<m; ++u ) {
+          scratch[0] = *Fout0;
 
-        Fout[k] += scratch[1];
-        scratch[3] = scratch[0] + scratch[2];
-        scratch[4] = scratch[0] - scratch[2];
-        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+          scratch[1]  = *Fout1 * tw[u*fstride];
+          scratch[2]  = *Fout2 * tw[2*u*fstride];
+          scratch[3]  = *Fout3 * tw[3*u*fstride];
+          scratch[4]  = *Fout4 * tw[4*u*fstride];
 
-        Fout[k+2*m]  = Fout[k] - scratch[3];
-        Fout[k] += scratch[3];
-        Fout[k+m] = scratch[5] + scratch[4];
-        Fout[k+3*m] = scratch[5] - scratch[4];
-      }
-    }
+          scratch[7] = scratch[1] + scratch[4];
+          scratch[10] = scratch[1] - scratch[4];
+          scratch[8] = scratch[2] + scratch[3];
+          scratch[9] = scratch[2] - scratch[3];
 
-    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      size_t k=m;
-      const size_t m2 = 2*m;
-      Complex *tw1,*tw2;
-      Complex scratch[5];
-      Complex epi3;
-      epi3 = m_twiddles[fstride*m];
+          *Fout0 +=  scratch[7];
+          *Fout0 +=  scratch[8];
 
-      tw1=tw2=&m_twiddles[0];
-
-      do{
-        scratch[1]=Fout[m] * *tw1;
-        scratch[2]=Fout[m2] * *tw2;
-
-        scratch[3]=scratch[1]+scratch[2];
-        scratch[0]=scratch[1]-scratch[2];
-        tw1 += fstride;
-        tw2 += fstride*2;
-        Fout[m] = Complex( Fout->real() - .5*scratch[3].real() , Fout->imag() - .5*scratch[3].imag() );
-        scratch[0] *= epi3.imag();
-        *Fout += scratch[3];
-        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
-        ++Fout;
-      }while(--k);
-    }
-
-    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-      size_t u;
-      Complex scratch[13];
-      Complex * twiddles = &m_twiddles[0];
-      Complex *tw;
-      Complex ya,yb;
-      ya = twiddles[fstride*m];
-      yb = twiddles[fstride*2*m];
-
-      Fout0=Fout;
-      Fout1=Fout0+m;
-      Fout2=Fout0+2*m;
-      Fout3=Fout0+3*m;
-      Fout4=Fout0+4*m;
-
-      tw=twiddles;
-      for ( u=0; u<m; ++u ) {
-        scratch[0] = *Fout0;
-
-        scratch[1]  = *Fout1 * tw[u*fstride];
-        scratch[2]  = *Fout2 * tw[2*u*fstride];
-        scratch[3]  = *Fout3 * tw[3*u*fstride];
-        scratch[4]  = *Fout4 * tw[4*u*fstride];
-
-        scratch[7] = scratch[1] + scratch[4];
-        scratch[10] = scratch[1] - scratch[4];
-        scratch[8] = scratch[2] + scratch[3];
-        scratch[9] = scratch[2] - scratch[3];
-
-        *Fout0 +=  scratch[7];
-        *Fout0 +=  scratch[8];
-
-        scratch[5] = scratch[0] + Complex(
-            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
-            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
-            );
-
-        scratch[6] = Complex(
-            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
-            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
-            );
-
-        *Fout1 = scratch[5] - scratch[6];
-        *Fout4 = scratch[5] + scratch[6];
-
-        scratch[11] = scratch[0] +
-          Complex(
-              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
-              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+          scratch[5] = scratch[0] + Complex(
+              (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+              (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
               );
 
-        scratch[12] = Complex(
-            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
-            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
-            );
+          scratch[6] = Complex(
+              (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+              -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+              );
 
-        *Fout2=scratch[11]+scratch[12];
-        *Fout3=scratch[11]-scratch[12];
+          *Fout1 = scratch[5] - scratch[6];
+          *Fout4 = scratch[5] + scratch[6];
 
-        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-      }
-    }
+          scratch[11] = scratch[0] +
+            Complex(
+                (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+                (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+                );
 
-    /* perform the butterfly for one stage of a mixed radix FFT */
-    void bfly_generic(
-        Complex * Fout,
-        const size_t fstride,
-        int m,
-        int p
-        )
-    {
-      int u,k,q1,q;
-      Complex * twiddles = &m_twiddles[0];
-      Complex t;
-      int Norig = m_twiddles.size();
-      Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+          scratch[12] = Complex(
+              -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+              (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+              );
 
-      for ( u=0; u<m; ++u ) {
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          scratchbuf[q1] = Fout[ k  ];
-          k += m;
+          *Fout2=scratch[11]+scratch[12];
+          *Fout3=scratch[11]-scratch[12];
+
+          ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
         }
+      }
 
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          int twidx=0;
-          Fout[ k ] = scratchbuf[0];
-          for (q=1;q<p;++q ) {
-            twidx += fstride * k;
-            if (twidx>=Norig) twidx-=Norig;
-            t=scratchbuf[q] * twiddles[twidx];
-            Fout[ k ] += t;
+      /* perform the butterfly for one stage of a mixed radix FFT */
+      void bfly_generic(
+          Complex * Fout,
+          const size_t fstride,
+          int m,
+          int p
+          )
+      {
+        int u,k,q1,q;
+        Complex * twiddles = &m_twiddles[0];
+        Complex t;
+        int Norig = m_twiddles.size();
+        Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+
+        for ( u=0; u<m; ++u ) {
+          k=u;
+          for ( q1=0 ; q1<p ; ++q1 ) {
+            scratchbuf[q1] = Fout[ k  ];
+            k += m;
+          }
+
+          k=u;
+          for ( q1=0 ; q1<p ; ++q1 ) {
+            int twidx=0;
+            Fout[ k ] = scratchbuf[0];
+            for (q=1;q<p;++q ) {
+              twidx += fstride * k;
+              if (twidx>=Norig) twidx-=Norig;
+              t=scratchbuf[q] * twiddles[twidx];
+              Fout[ k ] += t;
+            }
+            k += m;
           }
-          k += m;
         }
       }
-    }
-  };
+    };
 
   template <typename _Scalar>
-  struct ei_kissfft_impl
-  {
+    struct ei_kissfft_impl
+    {
       typedef _Scalar Scalar;
       typedef std::complex<Scalar> Complex;
 
@@ -284,10 +287,10 @@ namespace Eigen {
       }
 
       template <typename _Src>
-      void fwd( Complex * dst,const _Src *src,int nfft)
-      {
-        get_plan(nfft,false).work(0, dst, src, 1,1);
-      }
+        void fwd( Complex * dst,const _Src *src,int nfft)
+        {
+          get_plan(nfft,false).work(0, dst, src, 1,1);
+        }
 
       // real-to-complex forward FFT
       // perform two FFTs of src even and src odd
@@ -363,11 +366,10 @@ namespace Eigen {
         }
       }
 
-  private:
-
+      private:
       typedef ei_kiss_cpx_fft<Scalar> PlanData;
-
       typedef std::map<int,PlanData> PlanMap;
+
       PlanMap m_plans;
       std::map<int, std::vector<Complex> > m_realTwiddles;
       std::vector<Complex> m_scratchBuf;
@@ -376,25 +378,7 @@ namespace Eigen {
 
       PlanData & get_plan(int nfft,bool inverse)
       {
-        /*  TODO: figure out why this does not work (g++ 4.3.2)
-         * for some reason this does not work
-         *
-         PlanMap::iterator it;
-         it = m_plans.find( PlanKey(nfft,inverse) );
-         if (it == m_plans.end() ) {
-        // create new entry
-        it = m_plans.insert( make_pair( PlanKey(nfft,inverse) , PlanData() ) );
-        MapIt it2 = m_plans.find( PlanKey(nfft,!inverse) );
-        if (it2 != m_plans.end() ) {
-        it->second = it2.second;
-        it->second.conjugate();
-        }else{
-        it->second.make_twiddles(nfft,inverse);
-        it->second.factorize(nfft);
-        }
-        }
-        return it->second;
-        */
+        // TODO look for PlanKey(nfft, ! inverse) and conjugate the twiddles
         PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
         if ( pd.m_twiddles.size() == 0 ) {
           pd.make_twiddles(nfft,inverse);
@@ -421,5 +405,5 @@ namespace Eigen {
         for (int k=0;k<n;++k)
           dst[k] *= s;
       }
-  };
+    };
 }

From 1fd6dfe428e6b65383b84f8dfe8f67c70ec370b4 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 30 May 2009 17:55:47 -0400
Subject: [PATCH 12/18] added ei_fftw_impl

---
 unsupported/Eigen/FFT                       |   5 +-
 unsupported/Eigen/src/FFT/ei_fftw_impl.h    | 198 ++++++++++++++++++++
 unsupported/Eigen/src/FFT/ei_kissfft_impl.h |  25 +--
 3 files changed, 214 insertions(+), 14 deletions(-)
 create mode 100644 unsupported/Eigen/src/FFT/ei_fftw_impl.h

diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 31d8c74c5..03f8504a4 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -30,9 +30,8 @@
 #define DEFAULT_FFT_IMPL ei_kissfft_impl
 
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
-#ifdef FFTW_PATIENT  // definition of FFTW_PATIENT indicates the caller has included fftw3.h, we can use FFTW routines
-// TODO 
-// #include "src/FFT/ei_fftw_impl.h"
+#ifdef FFTW_ESTIMATE  // definition of FFTW_ESTIMATE indicates the caller has included fftw3.h, we can use FFTW routines
+#include "src/FFT/ei_fftw_impl.h"
 // #define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
diff --git a/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
new file mode 100644
index 000000000..d592bbb20
--- /dev/null
+++ b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
@@ -0,0 +1,198 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. 
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+namespace Eigen {
+  // FFTW uses non-const arguments
+  // so we must use ugly const_cast calls for all the args it uses
+  //
+  // This should be safe as long as 
+  // 1. we use FFTW_ESTIMATE for all our planning
+  //       see the FFTW docs section 4.3.2 "Planner Flags"
+  // 2. fftw_complex is compatible with std::complex
+  //    This assumes std::complex<T> layout is array of size 2 with real,imag
+  template <typename T> 
+  T * ei_fftw_cast(const T* p) 
+  { 
+      return const_cast<T*>( p); 
+  }
+
+  fftw_complex * ei_fftw_cast( const std::complex<double> * p) 
+  { 
+      return const_cast<fftw_complex*>( reinterpret_cast<const fftw_complex*>(p) ); 
+  }
+
+  fftwf_complex * ei_fftw_cast( const std::complex<float> * p) 
+  { 
+      return const_cast<fftwf_complex*>( reinterpret_cast<const fftwf_complex*>(p) ); 
+  }
+
+  fftwl_complex * ei_fftw_cast( const std::complex<long double> * p) 
+  { 
+      return const_cast<fftwl_complex*>( reinterpret_cast<const fftwl_complex*>(p) ); 
+  }
+
+  template <typename T> 
+  struct ei_fftw_plan {};
+
+  template <> 
+  struct ei_fftw_plan<float>
+  {
+      typedef float scalar_type;
+      typedef fftwf_complex complex_type;
+      fftwf_plan m_plan;
+      ei_fftw_plan() :m_plan(NULL) {}
+      ~ei_fftw_plan() {if (m_plan) fftwf_destroy_plan(m_plan);}
+
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
+          fftwf_execute_dft( m_plan, src,dst);
+      }
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
+          fftwf_execute_dft( m_plan, src,dst);
+      }
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwf_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftwf_execute_dft_r2c( m_plan,src,dst);
+      }
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftwf_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftwf_execute_dft_c2r( m_plan, src,dst);
+      }
+  };
+  template <> 
+  struct ei_fftw_plan<double>
+  {
+      typedef double scalar_type;
+      typedef fftw_complex complex_type;
+      fftw_plan m_plan;
+      ei_fftw_plan() :m_plan(NULL) {}
+      ~ei_fftw_plan() {if (m_plan) fftw_destroy_plan(m_plan);}
+
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
+          fftw_execute_dft( m_plan, src,dst);
+      }
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
+          fftw_execute_dft( m_plan, src,dst);
+      }
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftw_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftw_execute_dft_r2c( m_plan,src,dst);
+      }
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftw_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftw_execute_dft_c2r( m_plan, src,dst);
+      }
+  };
+  template <> 
+  struct ei_fftw_plan<long double>
+  {
+      typedef long double scalar_type;
+      typedef fftwl_complex complex_type;
+      fftwl_plan m_plan;
+      ei_fftw_plan() :m_plan(NULL) {}
+      ~ei_fftw_plan() {if (m_plan) fftwl_destroy_plan(m_plan);}
+
+      void fwd(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE);
+          fftwl_execute_dft( m_plan, src,dst);
+      }
+      void inv(complex_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE);
+          fftwl_execute_dft( m_plan, src,dst);
+      }
+      void fwd(complex_type * dst,scalar_type * src,int nfft) {
+          if (m_plan==NULL) m_plan = fftwl_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftwl_execute_dft_r2c( m_plan,src,dst);
+      }
+      void inv(scalar_type * dst,complex_type * src,int nfft) {
+          if (m_plan==NULL)
+              m_plan = fftwl_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE);
+          fftwl_execute_dft_c2r( m_plan, src,dst);
+      }
+  };
+
+  template <typename _Scalar>
+  struct ei_fftw_impl
+  {
+      typedef _Scalar Scalar;
+      typedef std::complex<Scalar> Complex;
+
+      void clear() 
+      {
+        m_plans.clear();
+      }
+
+      void fwd( Complex * dst,const Complex *src,int nfft)
+      {
+        get_plan(nfft,false,dst,src).fwd(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
+      }
+
+      // real-to-complex forward FFT
+      void fwd( Complex * dst,const Scalar * src,int nfft) 
+      {
+          get_plan(nfft,false,dst,src).fwd(ei_fftw_cast(dst), ei_fftw_cast(src) ,nfft);
+          int nhbins=(nfft>>1)+1;
+          for (int k=nhbins;k < nfft; ++k )
+              dst[k] = conj(dst[nfft-k]);
+      }
+
+      // inverse complex-to-complex
+      void inv(Complex * dst,const Complex  *src,int nfft)
+      {
+        get_plan(nfft,true,dst,src).inv(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
+        // scaling
+        Scalar s = 1./nfft;
+        for (int k=0;k<nfft;++k)
+          dst[k] *= s;
+      }
+
+      // half-complex to scalar
+      void inv( Scalar * dst,const Complex * src,int nfft) 
+      {
+        get_plan(nfft,true,dst,src).inv(ei_fftw_cast(dst), ei_fftw_cast(src),nfft );
+        Scalar s = 1./nfft;
+        for (int k=0;k<nfft;++k)
+          dst[k] *= s;
+      }
+
+  private:
+      typedef ei_fftw_plan<Scalar> PlanData;
+      typedef std::map<int,PlanData> PlanMap;
+
+      PlanMap m_plans;
+
+      PlanData & get_plan(int nfft,bool inverse,void * dst,const void * src)
+      {
+          bool inplace = (dst==src);
+          bool aligned = ( (reinterpret_cast<size_t>(src)&15) | (reinterpret_cast<size_t>(dst)&15) ) == 0;
+          int key = (nfft<<3 ) | (inverse<<2) | (inplace<<1) | aligned;
+          return m_plans[key];
+      }
+  };
+}
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 91fa5ca18..a84ac68a0 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -39,6 +39,7 @@ namespace Eigen {
       std::vector<Complex> m_twiddles;
       std::vector<int> m_stageRadix;
       std::vector<int> m_stageRemainder;
+      std::vector<Complex> m_scratchBuf;
       bool m_inverse;
 
       void make_twiddles(int nfft,bool inverse)
@@ -75,6 +76,8 @@ namespace Eigen {
           n /= p;
           m_stageRadix.push_back(p);
           m_stageRemainder.push_back(n);
+          if ( p > 5 )
+              m_scratchBuf.resize(p); // scratchbuf will be needed in bfly_generic
         }while(n>1);
       }
 
@@ -249,7 +252,7 @@ namespace Eigen {
         Complex * twiddles = &m_twiddles[0];
         Complex t;
         int Norig = m_twiddles.size();
-        Complex * scratchbuf = (Complex*)alloca(p*sizeof(Complex) );
+        Complex * scratchbuf = &m_scratchBuf[0];
 
         for ( u=0; u<m; ++u ) {
           k=u;
@@ -341,28 +344,28 @@ namespace Eigen {
       void inv( Scalar * dst,const Complex * src,int nfft) 
       {
         if (nfft&3) {
-          m_scratchBuf.resize(nfft);
-          inv(&m_scratchBuf[0],src,nfft);
+          m_tmpBuf.resize(nfft);
+          inv(&m_tmpBuf[0],src,nfft);
           for (int k=0;k<nfft;++k)
-            dst[k] = m_scratchBuf[k].real();
+            dst[k] = m_tmpBuf[k].real();
         }else{
           // optimized version for multiple of 4
           int ncfft = nfft>>1;
           int ncfft2 = nfft>>2;
           Complex * rtw = real_twiddles(ncfft2);
-          m_scratchBuf.resize(ncfft);
-          m_scratchBuf[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
+          m_tmpBuf.resize(ncfft);
+          m_tmpBuf[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
           for (int k = 1; k <= ncfft / 2; ++k) {
             Complex fk = src[k];
             Complex fnkc = conj(src[ncfft-k]);
             Complex fek = fk + fnkc;
             Complex tmp = fk - fnkc;
             Complex fok = tmp * conj(rtw[k-1]);
-            m_scratchBuf[k] = fek + fok;
-            m_scratchBuf[ncfft-k] = conj(fek - fok);
+            m_tmpBuf[k] = fek + fok;
+            m_tmpBuf[ncfft-k] = conj(fek - fok);
           }
-          scale(&m_scratchBuf[0], ncfft, Scalar(1)/nfft );
-          get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_scratchBuf[0], 1,1);
+          scale(&m_tmpBuf[0], ncfft, Scalar(1)/nfft );
+          get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_tmpBuf[0], 1,1);
         }
       }
 
@@ -372,7 +375,7 @@ namespace Eigen {
 
       PlanMap m_plans;
       std::map<int, std::vector<Complex> > m_realTwiddles;
-      std::vector<Complex> m_scratchBuf;
+      std::vector<Complex> m_tmpBuf;
 
       int PlanKey(int nfft,bool isinverse) const { return (nfft<<1) | isinverse; }
 

From 1c543401741f1328b65d30a764f6273ddf9b60b6 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sun, 31 May 2009 15:44:57 -0400
Subject: [PATCH 13/18] more work on ei_fftw_impl

---
 bench/benchFFT.cpp       | 4 ++++
 unsupported/Eigen/FFT    | 3 ++-
 unsupported/test/FFT.cpp | 5 +++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index 14f5063fb..4b6cabb55 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -26,6 +26,10 @@
 #include <vector>
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>
+#ifdef USE_FFTW
+#include <fftw3.h>
+#endif
+
 #include <unsupported/Eigen/FFT>
 
 using namespace Eigen;
diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 03f8504a4..dc7e85908 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -32,7 +32,8 @@
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #ifdef FFTW_ESTIMATE  // definition of FFTW_ESTIMATE indicates the caller has included fftw3.h, we can use FFTW routines
 #include "src/FFT/ei_fftw_impl.h"
-// #define DEFAULT_FFT_IMPL ei_fftw_impl
+#undef DEFAULT_FFT_IMPL
+#define DEFAULT_FFT_IMPL ei_fftw_impl
 #endif
 
 // intel Math Kernel Library: fastest, commerical -- incompatible with Eigen in GPL form
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 32d1393d0..28230e1c4 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -23,6 +23,11 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
 #include "main.h"
+//#define USE_FFTW
+#ifdef USE_FFTW
+#include <fftw3.h>
+#endif
+
 #include <unsupported/Eigen/FFT>
 
 using namespace std;

From 4d6b962ba47ca2a6445bebd144b8122f3b8e96b7 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 10 Jun 2009 22:16:32 -0400
Subject: [PATCH 14/18] added FindFFTW, but I don't think it's right yet

---
 cmake/FindFFTW.cmake            | 24 ++++++++++++++++++++++++
 unsupported/test/CMakeLists.txt |  6 ++++++
 unsupported/test/FFT.cpp        | 12 +++---------
 3 files changed, 33 insertions(+), 9 deletions(-)
 create mode 100644 cmake/FindFFTW.cmake

diff --git a/cmake/FindFFTW.cmake b/cmake/FindFFTW.cmake
new file mode 100644
index 000000000..a56450b17
--- /dev/null
+++ b/cmake/FindFFTW.cmake
@@ -0,0 +1,24 @@
+
+if (FFTW_INCLUDES AND FFTW_LIBRARIES)
+  set(FFTW_FIND_QUIETLY TRUE)
+endif (FFTW_INCLUDES AND FFTW_LIBRARIES)
+
+find_path(FFTW_INCLUDES
+  NAMES
+  fftw3.h
+  PATHS
+  $ENV{FFTWDIR}
+  ${INCLUDE_INSTALL_DIR}
+)
+
+find_library(FFTWF_LIB NAMES fftw3f PATHS $ENV{FFTWDIR} ${LIB_INSTALL_DIR})
+find_library(FFTW_LIB NAMES fftw3 PATHS $ENV{FFTWDIR} ${LIB_INSTALL_DIR})
+find_library(FFTWL_LIB NAMES fftw3l  PATHS $ENV{FFTWDIR} ${LIB_INSTALL_DIR})
+set(FFTW_LIBRARIES "${FFTWF_LIB} ${FFTW_LIB} ${FFTWL_LIB}" )
+message(STATUS "FFTW ${FFTW_LIBRARIES}" )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(FFTW DEFAULT_MSG
+                                  FFTW_INCLUDES FFTW_LIBRARIES)
+
+mark_as_advanced(FFTW_INCLUDES FFTW_LIBRARIES)
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 49de7dfb2..23217811f 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -20,3 +20,9 @@ ei_add_test(BVH)
 ei_add_test(matrixExponential)
 ei_add_test(alignedvector3)
 ei_add_test(FFT)
+
+find_package(FFTW)
+if(FFTW_FOUND)
+  ei_add_test(FFTW  " " "${FFTW_LIBRARIES}" )
+endif(FFTW_FOUND)
+
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 28230e1c4..f0b9b68bf 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -23,11 +23,6 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
 #include "main.h"
-//#define USE_FFTW
-#ifdef USE_FFTW
-#include <fftw3.h>
-#endif
-
 #include <unsupported/Eigen/FFT>
 
 using namespace std;
@@ -121,7 +116,7 @@ void test_complex(int nfft)
 
 void test_FFT()
 {
-#if 1
+
   CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
   CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); CALL_SUBTEST( test_complex<long double>(256) );
   CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
@@ -129,13 +124,12 @@ void test_FFT()
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
   CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
-#endif
 
-#if 1
+
+
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
   CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
   CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); CALL_SUBTEST( test_scalar<long double>(50) );
   CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
-#endif
 }

From 218711e18b1adc518a2f861fe53d5ce9da0e98b7 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 10 Jun 2009 23:25:27 -0400
Subject: [PATCH 15/18] example file

---
 unsupported/doc/examples/FFT.cpp | 117 +++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 unsupported/doc/examples/FFT.cpp

diff --git a/unsupported/doc/examples/FFT.cpp b/unsupported/doc/examples/FFT.cpp
new file mode 100644
index 000000000..55e29585a
--- /dev/null
+++ b/unsupported/doc/examples/FFT.cpp
@@ -0,0 +1,117 @@
+//  To use the simple FFT implementation
+//  g++ -o demofft -I.. -Wall -O3 FFT.cpp 
+
+//  To use the FFTW implementation
+//  g++ -o demofft -I.. -DUSE_FFTW -Wall -O3 FFT.cpp -lfftw3 -lfftw3f -lfftw3l
+
+#ifdef USE_FFTW
+#include <fftw3.h>
+#endif
+
+#include <vector>
+#include <complex>
+#include <algorithm>
+#include <iterator>
+#include <Eigen/Core>
+#include <unsupported/Eigen/FFT>
+
+using namespace std;
+using namespace Eigen;
+
+template <typename T>
+T mag2(T a)
+{
+    return a*a;
+}
+template <typename T>
+T mag2(std::complex<T> a)
+{
+    return norm(a);
+}
+
+template <typename T>
+T mag2(const std::vector<T> & vec)
+{
+    T out=0;
+    for (size_t k=0;k<vec.size();++k)
+        out += mag2(vec[k]);
+    return out;
+}
+
+template <typename T>
+T mag2(const std::vector<std::complex<T> > & vec)
+{
+    T out=0;
+    for (size_t k=0;k<vec.size();++k)
+        out += mag2(vec[k]);
+    return out;
+}
+
+template <typename T>
+vector<T> operator-(const vector<T> & a,const vector<T> & b )
+{
+    vector<T> c(a);
+    for (size_t k=0;k<b.size();++k) 
+        c[k] -= b[k];
+    return c;
+}
+
+template <typename T>
+void RandomFill(std::vector<T> & vec)
+{
+    for (size_t k=0;k<vec.size();++k)
+        vec[k] = T( rand() )/T(RAND_MAX) - .5;
+}
+
+template <typename T>
+void RandomFill(std::vector<std::complex<T> > & vec)
+{
+    for (size_t k=0;k<vec.size();++k)
+        vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - .5, T( rand() )/T(RAND_MAX) - .5);
+}
+
+template <typename T_time,typename T_freq>
+void fwd_inv(size_t nfft)
+{
+    typedef typename NumTraits<T_freq>::Real Scalar;
+    vector<T_time> timebuf(nfft);
+    RandomFill(timebuf);
+
+    vector<T_freq> freqbuf;
+    static FFT<Scalar> fft;
+    fft.fwd(freqbuf,timebuf);
+
+    vector<T_time> timebuf2;
+    fft.inv(timebuf2,freqbuf);
+
+    long double rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
+    cout << "roundtrip rmse: " << rmse << endl;
+}
+
+template <typename T_scalar>
+void two_demos(int nfft)
+{
+    cout << "     scalar ";
+    fwd_inv<T_scalar,std::complex<T_scalar> >(nfft);
+    cout << "    complex ";
+    fwd_inv<std::complex<T_scalar>,std::complex<T_scalar> >(nfft);
+}
+
+void demo_all_types(int nfft)
+{
+    cout << "nfft=" << nfft << endl;
+    cout << "   float" << endl;
+    two_demos<float>(nfft);
+    cout << "   double" << endl;
+    two_demos<double>(nfft);
+    cout << "   long double" << endl;
+    two_demos<long double>(nfft);
+}
+
+int main()
+{
+    demo_all_types( 2*3*4*5*7 );
+    demo_all_types( 2*9*16*25 );
+    demo_all_types( 1024 );
+    return 0;
+}

From e577c70e49dddd88c83175921431d2725128b0f3 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 16 Jun 2009 23:54:58 -0400
Subject: [PATCH 16/18] candidate header for Eigen/Complex

---
 unsupported/Eigen/Complex | 142 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 unsupported/Eigen/Complex

diff --git a/unsupported/Eigen/Complex b/unsupported/Eigen/Complex
new file mode 100644
index 000000000..f8d9af25c
--- /dev/null
+++ b/unsupported/Eigen/Complex
@@ -0,0 +1,142 @@
+#ifndef EIGEN_COMPLEX_H
+#define EIGEN_COMPLEX_H
+
+#include <complex>
+
+namespace Eigen {
+
+template <typename _NativePtr,typename _PunnedPtr>
+struct castable_pointer
+{
+    castable_pointer(_NativePtr ptr) : _ptr(ptr) {}
+    operator _NativePtr ()  {return _ptr;}
+    operator _PunnedPtr ()  {return reinterpret_cast<_PunnedPtr>(_ptr);}
+    private: 
+    _NativePtr _ptr;
+};
+
+template <typename T>
+struct Complex
+{
+    typedef typename std::complex<T> StandardComplex;
+    typedef T value_type;
+
+    Complex(const T& re = T(), const T& im = T()) : _re(re),_im(im) { }
+    Complex(const Complex&other ): _re(other.real()) ,_im(other.imag()) {}
+
+    template<class X> 
+    Complex(const Complex<X>&other): _re(other.real()) ,_im(other.imag()) {}
+    template<class X> 
+    Complex(const std::complex<X>&other): _re(other.real()) ,_im(other.imag()) {}
+
+    typedef castable_pointer< Complex<T>*, StandardComplex* > pointer_type;
+    typedef castable_pointer< const Complex<T>*, const StandardComplex* > const_pointer_type;
+
+    pointer_type operator & () {return pointer_type(this);}
+    const_pointer_type operator & () const {return const_pointer_type(this);}
+
+    StandardComplex std_type() const {return StandardComplex(real(),imag());}
+    StandardComplex & std_type() {return *(StandardComplex*)(&(*this));}
+
+    operator StandardComplex () const {return std_type();}
+    operator StandardComplex & () {return std_type();}
+
+    T & real() {return _re;}
+    T & imag() {return _im;}
+    const T & real() const {return _re;}
+    const T & imag() const {return _im;}
+
+    // *** complex member functions: ***
+    Complex<T>& operator= (const T& val) { _re=val;_im=0;return *this;  }
+    Complex<T>& operator+= (const T& val) {_re+=val;return *this;}
+    Complex<T>& operator-= (const T& val) {_re-=val;return *this;}
+    Complex<T>& operator*= (const T& val) {_re*=val;_im*=val;return *this;  }
+    Complex<T>& operator/= (const T& val) {_re/=val;_im/=val;return *this;  }
+
+    Complex& operator= (const Complex& rhs) {_re=rhs._re;_im=rhs._im;return *this;}
+    Complex& operator= (const StandardComplex& rhs) {_re=rhs.real();_im=rhs.imag();return *this;}
+
+    template<class X> Complex<T>& operator= (const Complex<X>& rhs) { _re=rhs._re;_im=rhs._im;return *this;}
+    template<class X> Complex<T>& operator+= (const Complex<X>& rhs) { _re+=rhs._re;_im+=rhs._im;return *this;}
+    template<class X> Complex<T>& operator-= (const Complex<X>& rhs) { _re-=rhs._re;_im-=rhs._im;return *this;}
+    template<class X> Complex<T>& operator*= (const Complex<X>& rhs) { this->std_type() *= rhs.std_type(); return *this; }
+    template<class X> Complex<T>& operator/= (const Complex<X>& rhs) { this->std_type() /= rhs.std_type(); return *this; }
+
+    private:
+    T _re;
+    T _im;
+};
+
+template <typename T>
+T ei_to_std( const T & x) {return x;}
+
+template <typename T>
+std::complex<T> ei_to_std( const Complex<T> & x) {return x.std_type();}
+
+// 26.2.6 operators
+template<class T> Complex<T> operator+(const Complex<T>& rhs) {return rhs;}
+template<class T> Complex<T> operator-(const Complex<T>& rhs) {return -ei_to_std(rhs);}
+
+template<class T> Complex<T> operator+(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) + ei_to_std(rhs);}
+template<class T> Complex<T> operator-(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) - ei_to_std(rhs);}
+template<class T> Complex<T> operator*(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) * ei_to_std(rhs);}
+template<class T> Complex<T> operator/(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) / ei_to_std(rhs);}
+template<class T> bool operator==(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) == ei_to_std(rhs);}
+template<class T> bool operator!=(const Complex<T>& lhs, const Complex<T>& rhs) { return ei_to_std(lhs) != ei_to_std(rhs);}
+
+template<class T> Complex<T> operator+(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) + ei_to_std(rhs); }
+template<class T> Complex<T> operator-(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) - ei_to_std(rhs); }
+template<class T> Complex<T> operator*(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) * ei_to_std(rhs); }
+template<class T> Complex<T> operator/(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) / ei_to_std(rhs); }
+template<class T> bool operator==(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) == ei_to_std(rhs); }
+template<class T> bool operator!=(const Complex<T>& lhs, const T& rhs) {return ei_to_std(lhs) != ei_to_std(rhs); }
+
+template<class T> Complex<T> operator+(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) + ei_to_std(rhs); }
+template<class T> Complex<T> operator-(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) - ei_to_std(rhs); }
+template<class T> Complex<T> operator*(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) * ei_to_std(rhs); }
+template<class T> Complex<T> operator/(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) / ei_to_std(rhs); }
+template<class T> bool operator==(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) == ei_to_std(rhs); }
+template<class T> bool operator!=(const T& lhs, const Complex<T>& rhs) {return ei_to_std(lhs) != ei_to_std(rhs); }
+
+template<class T, class charT, class traits>
+std::basic_istream<charT,traits>&
+  operator>> (std::basic_istream<charT,traits>& istr, Complex<T>& rhs)
+{
+    return istr >> rhs.std_type();
+}
+
+template<class T, class charT, class traits>
+std::basic_ostream<charT,traits>&
+operator<< (std::basic_ostream<charT,traits>& ostr, const Complex<T>& rhs)
+{
+    return ostr << rhs.std_type();
+}
+
+ // 26.2.7 values:
+ template<class T> T real(const Complex<T>&x) {return real(ei_to_std(x));}
+ template<class T> T abs(const Complex<T>&x) {return abs(ei_to_std(x));}
+ template<class T> T arg(const Complex<T>&x) {return arg(ei_to_std(x));}
+ template<class T> T norm(const Complex<T>&x) {return norm(ei_to_std(x));}
+
+ template<class T> Complex<T> conj(const Complex<T>&x) { return conj(ei_to_std(x));}
+ template<class T> Complex<T> polar(const T& x, const T&y) {return polar(ei_to_std(x),ei_to_std(y));} 
+ // 26.2.8 transcendentals:
+ template<class T> Complex<T> cos (const  Complex<T>&x){return cos(ei_to_std(x));}
+ template<class T> Complex<T> cosh (const  Complex<T>&x){return cosh(ei_to_std(x));}
+ template<class T> Complex<T> exp (const  Complex<T>&x){return exp(ei_to_std(x));}
+ template<class T> Complex<T> log (const  Complex<T>&x){return log(ei_to_std(x));}
+ template<class T> Complex<T> log10 (const  Complex<T>&x){return log10(ei_to_std(x));}
+
+ template<class T> Complex<T> pow(const Complex<T>&x, int p) {return pow(ei_to_std(x),ei_to_std(p));}
+ template<class T> Complex<T> pow(const Complex<T>&x, const T&p) {return pow(ei_to_std(x),ei_to_std(p));}
+ template<class T> Complex<T> pow(const Complex<T>&x, const Complex<T>&p) {return pow(ei_to_std(x),ei_to_std(p));}
+ template<class T> Complex<T> pow(const T&x, const Complex<T>&p) {return pow(ei_to_std(x),ei_to_std(p));}
+
+ template<class T> Complex<T> sin (const  Complex<T>&x){return sin(ei_to_std(x));}
+ template<class T> Complex<T> sinh (const  Complex<T>&x){return sinh(ei_to_std(x));}
+ template<class T> Complex<T> sqrt (const  Complex<T>&x){return sqrt(ei_to_std(x));}
+ template<class T> Complex<T> tan (const  Complex<T>&x){return tan(ei_to_std(x));}
+ template<class T> Complex<T> tanh (const  Complex<T>&x){return tanh(ei_to_std(x));}
+}
+
+#endif

From fb9a15e451371f4e600dbb60b8e5c9d098889844 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Wed, 17 Jun 2009 00:09:18 -0400
Subject: [PATCH 17/18] added copyright notice

---
 unsupported/Eigen/Complex | 50 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/unsupported/Eigen/Complex b/unsupported/Eigen/Complex
index f8d9af25c..a0483abdf 100644
--- a/unsupported/Eigen/Complex
+++ b/unsupported/Eigen/Complex
@@ -1,6 +1,34 @@
 #ifndef EIGEN_COMPLEX_H
 #define EIGEN_COMPLEX_H
 
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+// Eigen::Complex reuses as much as possible from std::complex
+// and allows easy conversion to and from, even at the pointer level.
+
+
 #include <complex>
 
 namespace Eigen {
@@ -21,6 +49,7 @@ struct Complex
     typedef typename std::complex<T> StandardComplex;
     typedef T value_type;
 
+    // constructors
     Complex(const T& re = T(), const T& im = T()) : _re(re),_im(im) { }
     Complex(const Complex&other ): _re(other.real()) ,_im(other.imag()) {}
 
@@ -29,22 +58,31 @@ struct Complex
     template<class X> 
     Complex(const std::complex<X>&other): _re(other.real()) ,_im(other.imag()) {}
 
+
+    // allow binary access to the object as a std::complex
     typedef castable_pointer< Complex<T>*, StandardComplex* > pointer_type;
     typedef castable_pointer< const Complex<T>*, const StandardComplex* > const_pointer_type;
-
     pointer_type operator & () {return pointer_type(this);}
     const_pointer_type operator & () const {return const_pointer_type(this);}
 
-    StandardComplex std_type() const {return StandardComplex(real(),imag());}
-    StandardComplex & std_type() {return *(StandardComplex*)(&(*this));}
-
     operator StandardComplex () const {return std_type();}
     operator StandardComplex & () {return std_type();}
 
-    T & real() {return _re;}
-    T & imag() {return _im;}
+    StandardComplex std_type() const {return StandardComplex(real(),imag());}
+    StandardComplex & std_type() {return *reinterpret_cast<StandardComplex*>(this);}
+
+
+    // every sort of accessor and mutator that has ever been in fashion.
+    // For a brief history, search for "std::complex over-encapsulated"
+    // http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#387
     const T & real() const {return _re;}
     const T & imag() const {return _im;}
+    T & real() {return _re;}
+    T & imag() {return _im;}
+    T & real(const T & x) {return _re=x;}
+    T & imag(const T & x) {return _im=x;}
+    void set_real(const T & x) {_re = x;}
+    void set_imag(const T & x) {_im = x;}
 
     // *** complex member functions: ***
     Complex<T>& operator= (const T& val) { _re=val;_im=0;return *this;  }

From a39de276a95234ff85f58fa93364b047f3d72c6b Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Mon, 14 Sep 2009 01:52:26 -0400
Subject: [PATCH 18/18] added the test case for FFTW

---
 unsupported/test/CMakeLists.txt |   2 +-
 unsupported/test/FFTW.cpp       | 136 ++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 unsupported/test/FFTW.cpp

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 23217811f..f42077bdc 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -23,6 +23,6 @@ ei_add_test(FFT)
 
 find_package(FFTW)
 if(FFTW_FOUND)
-  ei_add_test(FFTW  " " "${FFTW_LIBRARIES}" )
+  ei_add_test(FFTW  " " "-lfftw3 -lfftw3f -lfftw3l" )
 endif(FFTW_FOUND)
 
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
new file mode 100644
index 000000000..cf7be75aa
--- /dev/null
+++ b/unsupported/test/FFTW.cpp
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+#include <fftw3.h>
+#include <unsupported/Eigen/FFT>
+
+using namespace std;
+
+float norm(float x) {return x*x;}
+double norm(double x) {return x*x;}
+long double norm(long double x) {return x*x;}
+
+template < typename T>
+complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
+
+complex<long double>  promote(float x) { return complex<long double>( x); }
+complex<long double>  promote(double x) { return complex<long double>( x); }
+complex<long double>  promote(long double x) { return complex<long double>( x); }
+    
+
+    template <typename T1,typename T2>
+    long double fft_rmse( const vector<T1> & fftbuf,const vector<T2> & timebuf)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        cerr <<"idx\ttruth\t\tvalue\t|dif|=\n";
+        for (size_t k0=0;k0<fftbuf.size();++k0) {
+            complex<long double> acc = 0;
+            long double phinc = -2.*k0* M_PIl / timebuf.size();
+            for (size_t k1=0;k1<timebuf.size();++k1) {
+                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
+            }
+            totalpower += norm(acc);
+            complex<long double> x = promote(fftbuf[k0]); 
+            complex<long double> dif = acc - x;
+            difpower += norm(dif);
+            cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(norm(dif)) << endl;
+        }
+        cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
+        return sqrt(difpower/totalpower);
+    }
+
+    template <typename T1,typename T2>
+    long double dif_rmse( const vector<T1> buf1,const vector<T2> buf2)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        size_t n = min( buf1.size(),buf2.size() );
+        for (size_t k=0;k<n;++k) {
+            totalpower += (norm( buf1[k] ) + norm(buf2[k]) )/2.;
+            difpower += norm(buf1[k] - buf2[k]);
+        }
+        return sqrt(difpower/totalpower);
+    }
+
+template <class T>
+void test_scalar(int nfft)
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+    typedef typename Eigen::FFT<T>::Scalar Scalar;
+
+    FFT<T> fft;
+    vector<Scalar> inbuf(nfft);
+    vector<Complex> outbuf;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= (T)(rand()/(double)RAND_MAX - .5);
+    fft.fwd( outbuf,inbuf);
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+
+    vector<Scalar> buf3;
+    fft.inv( buf3 , outbuf);
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
+}
+
+template <class T>
+void test_complex(int nfft)
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+
+    FFT<T> fft;
+
+    vector<Complex> inbuf(nfft);
+    vector<Complex> outbuf;
+    vector<Complex> buf3;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( outbuf , inbuf);
+
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+
+    fft.inv( buf3 , outbuf);
+
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
+}
+
+void test_FFTW()
+{
+
+  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
+  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); CALL_SUBTEST( test_complex<long double>(256) );
+  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
+  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
+  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+
+
+
+  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
+  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); CALL_SUBTEST( test_scalar<long double>(50) );
+  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
+}