Add PocketFFT support to the FFT module, since kissfft has accuracy and performance flaws

2025-09-18 20:33:14 +08:00 · 2022-05-11 17:44:22 +00:00 · 2022-05-11 17:44:22 +00:00 · 00b75375e7
commit 00b75375e7
parent 73d65dbc43
8 changed files with 390 additions and 273 deletions
--- a/test/main.h
+++ b/test/main.h
@ -87,7 +87,7 @@
 // protected by parenthesis against macro expansion, the min()/max() macros
 // are defined here and any not-parenthesized min/max call will cause a
 // compiler error.
-#if !defined(__HIPCC__) && !defined(EIGEN_USE_SYCL)
+#if !defined(__HIPCC__) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_POCKETFFT_DEFAULT)
  //
  // HIP header files include the following files
  //  <thread>
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@ -29,10 +29,19 @@
  * The default implementation is based on kissfft. It is a small, free, and
  * reasonably efficient default.
  *
-  * There are currently two implementation backend:
+  * There are currently four implementation backends:
  *
+  * - kissfft (https://github.com/mborgerding/kissfft) : simple and not so fast, BSD-3-Clause.
+  *   It is a mixed-radix Fast Fourier Transform based upon the principle "Keep It Simple, Stupid."
+  *   Note that kissfft fails to handle "atypically-sized" inputs (i.e., sizes with large factors); a workaround is to use fftw or pocketfft.
  * - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
  * - MKL (http://en.wikipedia.org/wiki/Math_Kernel_Library) : fastest, commercial -- may be incompatible with Eigen in GPL form.
+  * - pocketfft (https://gitlab.mpcdf.mpg.de/mtr/pocketfft) : faster than kissfft, BSD 3-clause.
+  *   It is a heavily modified implementation of FFTPack, with the following advantages:
+  *   1. strictly C++11 compliant
+  *   2. more accurate twiddle factor computation
+  *   3. very fast plan generation
+  *   4. worst-case complexity for transform sizes with large prime factors is N*log(N), because Bluestein's algorithm is used for these cases.
  *
  * \section FFTDesign Design
  *
@ -85,9 +94,16 @@
   namespace Eigen {
     template <typename T> struct default_fft_impl : public internal::imklfft_impl {};
   }
-#else
+#elif defined EIGEN_POCKETFFT_DEFAULT
+// internal::pocketfft_impl: wrapper around pocketfft, a header-only, heavily modified implementation of FFTPack (BSD 3-clause).
+# include<pocketfft_hdronly.h>
+# include"src/FFT/ei_pocketfft_impl.h"
+  namespace Eigen {
+     template <typename T>
+      struct default_fft_impl : public internal::pocketfft_impl<T> {};
+  }
+#else 
 // internal::kissfft_impl:  small, free, reasonably efficient default, derived from kissfft
-//
 # include "src/FFT/ei_kissfft_impl.h"
  namespace Eigen {
     template <typename T> 
@ -195,13 +211,13 @@ class FFT
        m_impl.fwd(dst,src,static_cast<int>(nfft));
    }

-    /*
+#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT
    inline 
    void fwd2(Complex * dst, const Complex * src, int n0,int n1)
    {
      m_impl.fwd2(dst,src,n0,n1);
    }
-    */
+#endif

    template <typename Input_>
    inline
@ -354,8 +370,7 @@ class FFT
    }


-    /*
-    // TODO: multi-dimensional FFTs
+#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT 
    inline 
    void inv2(Complex * dst, const Complex * src, int n0,int n1)
    {
@ -363,7 +378,8 @@ class FFT
      if ( HasFlag( Unscaled ) == false)
          scale(dst,1./(n0*n1),n0*n1);
    }
-  */
+#endif
+

    inline
    impl_type & impl() {return m_impl;}
--- a/unsupported/Eigen/src/FFT/ei_pocketfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_pocketfft_impl.h
@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. 
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+namespace Eigen {
+
+namespace internal {
+
+// FFT backend that forwards every transform to the pocketfft library.
+//
+// pocketfft_hdronly.h must already be included when this header is parsed.
+// All pocketfft names are qualified explicitly: this is a header, and a
+// using-directive at global scope would leak the pocketfft and
+// pocketfft::detail namespaces into every file that includes the FFT module.
+//
+// Conventions shared by all members below:
+//  - buffers are described to pocketfft by a shape (element counts) and by
+//    strides that are passed as sizeof() of the element type, i.e. in bytes;
+//  - the trailing argument (pocketfft's scale factor) is always 1, so no
+//    normalization happens here; any scaling is left to the caller.
+template<typename _Scalar>
+struct pocketfft_impl
+{
+  typedef _Scalar Scalar;
+  typedef std::complex<Scalar> Complex;
+  typedef pocketfft::shape_t shape_t;
+  typedef pocketfft::stride_t stride_t;
+
+  // Nothing to release: this backend keeps no state between calls.
+  inline void clear() {}
+
+  // 1D forward transform of nfft real samples (src) into complex bins (dst).
+  inline void fwd(Complex* dst, const Scalar* src, int nfft){
+    const shape_t  shape_{ static_cast<size_t>(nfft) };
+    const shape_t  axes_{ 0 };
+    const stride_t stride_in{ sizeof(Scalar) };
+    const stride_t stride_out{ sizeof(Complex) };
+    pocketfft::r2c(shape_, stride_in, stride_out, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  // 1D forward transform of nfft complex samples.
+  inline void fwd(Complex* dst, const Complex* src, int nfft){
+    const shape_t  shape_{ static_cast<size_t>(nfft) };
+    const shape_t  axes_{ 0 };
+    const stride_t stride_{ sizeof(Complex) };
+    pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  // 1D backward transform from complex bins (src) to nfft real samples (dst).
+  inline void inv(Scalar* dst,  const Complex* src, int nfft){
+    const shape_t  shape_{ static_cast<size_t>(nfft) };
+    const shape_t  axes_{ 0 };
+    const stride_t stride_in{ sizeof(Complex) };
+    const stride_t stride_out{ sizeof(Scalar) };
+    pocketfft::c2r(shape_, stride_in, stride_out, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  // 1D backward transform of nfft complex bins.
+  inline void inv(Complex* dst, const Complex* src, int nfft){
+    const shape_t  shape_{ static_cast<size_t>(nfft) };
+    const shape_t  axes_{ 0 };
+    const stride_t stride_{ sizeof(Complex) };
+    pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  // 2D forward transform over both axes of a contiguous nfft0 x nfft1 buffer
+  // whose second index varies fastest (row stride is nfft1 elements).
+  inline void fwd2(Complex* dst, const Complex* src, int nfft0, int nfft1){
+    const shape_t  shape_{ static_cast<size_t>(nfft0), static_cast<size_t>(nfft1) };
+    const shape_t  axes_{ 0, 1 };
+    const stride_t stride_{ static_cast<ptrdiff_t>(sizeof(Complex)*nfft1), static_cast<ptrdiff_t>(sizeof(Complex)) };
+    pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  // 2D backward transform; same buffer layout as fwd2.
+  inline void inv2(Complex* dst, const Complex* src, int nfft0, int nfft1){
+    const shape_t  shape_{ static_cast<size_t>(nfft0), static_cast<size_t>(nfft1) };
+    const shape_t  axes_{ 0, 1 };
+    const stride_t stride_{ static_cast<ptrdiff_t>(sizeof(Complex)*nfft1), static_cast<ptrdiff_t>(sizeof(Complex)) };
+    pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
+  }
+};
+
+} // namespace internal
+} // namespace Eigen
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@ -77,6 +77,17 @@ else()
  ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
 endif()

+find_path(POCKETFFT  pocketfft_hdronly.h)
+if(POCKETFFT)
+  if(EIGEN_TEST_CXX11)
+    ei_add_property(EIGEN_TESTED_BACKENDS "pocketfft, ")
+    include_directories( ${POCKETFFT} )
+    ei_add_test(pocketfft "-pthread" "${CMAKE_THREAD_LIBS_INIT}" "-DEIGEN_POCKETFFT_DEFAULT" )  
+  endif()  
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "pocketfft, ")
+endif()
+
 option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
 if(EIGEN_TEST_OPENGL)
  find_package(OpenGL)
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@ -1,2 +1,2 @@
-#define test_FFTW test_FFT
-#include "FFTW.cpp"
+#define EIGEN_FFT_DEFAULT 1
+#include "fft_test_shared.h"
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@ -1,262 +1,2 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Mark Borgerding mark a borgerding net
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-#include <unsupported/Eigen/FFT>
-
-template <typename T> 
-std::complex<T> RandomCpx() { return std::complex<T>( (T)(rand()/(T)RAND_MAX - .5), (T)(rand()/(T)RAND_MAX - .5) ); }
-
-using namespace std;
-using namespace Eigen;
-
-
-template < typename T>
-complex<long double>  promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); }
-
-complex<long double>  promote(float x) { return complex<long double>((long double)x); }
-complex<long double>  promote(double x) { return complex<long double>((long double)x); }
-complex<long double>  promote(long double x) { return complex<long double>((long double)x); }
-    
-
-    template <typename VT1,typename VT2>
-    long double fft_rmse( const VT1 & fftbuf,const VT2 & timebuf)
-    {
-        long double totalpower=0;
-        long double difpower=0;
-        long double pi = acos((long double)-1 );
-        for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) {
-            complex<long double> acc = 0;
-            long double phinc = (long double)(-2.)*k0* pi / timebuf.size();
-            for (size_t k1=0;k1<(size_t)timebuf.size();++k1) {
-                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
-            }
-            totalpower += numext::abs2(acc);
-            complex<long double> x = promote(fftbuf[k0]); 
-            complex<long double> dif = acc - x;
-            difpower += numext::abs2(dif);
-            //cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(numext::abs2(dif)) << endl;
-        }
-        cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
-        return sqrt(difpower/totalpower);
-    }
-
-    template <typename VT1,typename VT2>
-    long double dif_rmse( const VT1 buf1,const VT2 buf2)
-    {
-        long double totalpower=0;
-        long double difpower=0;
-        size_t n = (min)( buf1.size(),buf2.size() );
-        for (size_t k=0;k<n;++k) {
-            totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2);
-            difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
-        }
-        return sqrt(difpower/totalpower);
-    }
-
-enum { StdVectorContainer, EigenVectorContainer };
-
-template<int Container, typename Scalar> struct VectorType;
-
-template<typename Scalar> struct VectorType<StdVectorContainer,Scalar>
-{
-  typedef vector<Scalar> type;
-};
-
-template<typename Scalar> struct VectorType<EigenVectorContainer,Scalar>
-{
-  typedef Matrix<Scalar,Dynamic,1> type;
-};
-
-template <int Container, typename T>
-void test_scalar_generic(int nfft)
-{
-    typedef typename FFT<T>::Complex Complex;
-    typedef typename FFT<T>::Scalar Scalar;
-    typedef typename VectorType<Container,Scalar>::type ScalarVector;
-    typedef typename VectorType<Container,Complex>::type ComplexVector;
-
-    FFT<T> fft;
-    ScalarVector tbuf(nfft);
-    ComplexVector freqBuf;
-    for (int k=0;k<nfft;++k)
-        tbuf[k]= (T)( rand()/(double)RAND_MAX - .5);
-
-    // make sure it DOESN'T give the right full spectrum answer
-    // if we've asked for half-spectrum
-    fft.SetFlag(fft.HalfSpectrum );
-    fft.fwd( freqBuf,tbuf);
-    VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) );
-    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
-
-    fft.ClearFlag(fft.HalfSpectrum );
-    fft.fwd( freqBuf,tbuf);
-    VERIFY( (size_t)freqBuf.size() == (size_t)nfft);
-    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
-
-    if (nfft&1)
-        return; // odd FFTs get the wrong size inverse FFT
-
-    ScalarVector tbuf2;
-    fft.inv( tbuf2 , freqBuf);
-    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
-
-
-    // verify that the Unscaled flag takes effect
-    ScalarVector tbuf3;
-    fft.SetFlag(fft.Unscaled);
-
-    fft.inv( tbuf3 , freqBuf);
-
-    for (int k=0;k<nfft;++k)
-        tbuf3[k] *= T(1./nfft);
-
-
-    //for (size_t i=0;i<(size_t) tbuf.size();++i)
-    //    cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " -  in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) <<  endl;
-
-    VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>()  );// gross check
-
-    // verify that ClearFlag works
-    fft.ClearFlag(fft.Unscaled);
-    fft.inv( tbuf2 , freqBuf);
-    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
-}
-
-template <typename T>
-void test_scalar(int nfft)
-{
-  test_scalar_generic<StdVectorContainer,T>(nfft);
-  //test_scalar_generic<EigenVectorContainer,T>(nfft);
-}
-
-
-template <int Container, typename T>
-void test_complex_generic(int nfft)
-{
-    typedef typename FFT<T>::Complex Complex;
-    typedef typename VectorType<Container,Complex>::type ComplexVector;
-
-    FFT<T> fft;
-
-    ComplexVector inbuf(nfft);
-    ComplexVector outbuf;
-    ComplexVector buf3;
-    for (int k=0;k<nfft;++k)
-        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
-    fft.fwd( outbuf , inbuf);
-
-    VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>()  );// gross check
-    fft.inv( buf3 , outbuf);
-
-    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
-
-    // verify that the Unscaled flag takes effect
-    ComplexVector buf4;
-    fft.SetFlag(fft.Unscaled);
-    fft.inv( buf4 , outbuf);
-    for (int k=0;k<nfft;++k)
-        buf4[k] *= T(1./nfft);
-    VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>()  );// gross check
-
-    // verify that ClearFlag works
-    fft.ClearFlag(fft.Unscaled);
-    fft.inv( buf3 , outbuf);
-    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
-}
-
-template <typename T>
-void test_complex(int nfft)
-{
-  test_complex_generic<StdVectorContainer,T>(nfft);
-  test_complex_generic<EigenVectorContainer,T>(nfft);
-}
-/*
-template <typename T,int nrows,int ncols>
-void test_complex2d()
-{
-    typedef typename Eigen::FFT<T>::Complex Complex;
-    FFT<T> fft;
-    Eigen::Matrix<Complex,nrows,ncols> src,src2,dst,dst2;
-
-    src = Eigen::Matrix<Complex,nrows,ncols>::Random();
-    //src =  Eigen::Matrix<Complex,nrows,ncols>::Identity();
-
-    for (int k=0;k<ncols;k++) {
-        Eigen::Matrix<Complex,nrows,1> tmpOut;
-        fft.fwd( tmpOut,src.col(k) );
-        dst2.col(k) = tmpOut;
-    }
-
-    for (int k=0;k<nrows;k++) {
-        Eigen::Matrix<Complex,1,ncols> tmpOut;
-        fft.fwd( tmpOut,  dst2.row(k) );
-        dst2.row(k) = tmpOut;
-    }
-
-    fft.fwd2(dst.data(),src.data(),ncols,nrows);
-    fft.inv2(src2.data(),dst.data(),ncols,nrows);
-    VERIFY( (src-src2).norm() < test_precision<T>() );
-    VERIFY( (dst-dst2).norm() < test_precision<T>() );
-}
-*/
-
-
-void test_return_by_value(int len)
-{
-    VectorXf in;
-    VectorXf in1;
-    in.setRandom( len );
-    VectorXcf out1,out2;
-    FFT<float> fft;
-
-    fft.SetFlag(fft.HalfSpectrum );
-
-    fft.fwd(out1,in);
-    out2 = fft.fwd(in);
-    VERIFY( (out1-out2).norm() < test_precision<float>() );
-    in1 = fft.inv(out1);
-    VERIFY( (in1-in).norm() < test_precision<float>() );
-}
-
-EIGEN_DECLARE_TEST(FFTW)
-{
-  CALL_SUBTEST( test_return_by_value(32) );
-  //CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
-  //CALL_SUBTEST( ( test_complex2d<long double,4,8> () ) );
-  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); 
-  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); 
-  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); 
-  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); 
-  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); 
-  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); 
-  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); 
-
-  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); 
-  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); 
-  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); 
-  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); 
-  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); 
-  
-  #ifdef EIGEN_HAS_FFTWL
-  CALL_SUBTEST( test_complex<long double>(32) );
-  CALL_SUBTEST( test_complex<long double>(256) );
-  CALL_SUBTEST( test_complex<long double>(3*8) );
-  CALL_SUBTEST( test_complex<long double>(5*32) );
-  CALL_SUBTEST( test_complex<long double>(2*3*4) );
-  CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
-  CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
-  
-  CALL_SUBTEST( test_scalar<long double>(32) );
-  CALL_SUBTEST( test_scalar<long double>(45) );
-  CALL_SUBTEST( test_scalar<long double>(50) );
-  CALL_SUBTEST( test_scalar<long double>(256) );
-  CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
-  #endif
-}
+#define EIGEN_FFTW_DEFAULT 1 
+#include "fft_test_shared.h"
--- a/unsupported/test/fft_test_shared.h
+++ b/unsupported/test/fft_test_shared.h
@ -0,0 +1,279 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/FFT>
+
+template <typename T> 
+std::complex<T> RandomCpx() { return std::complex<T>( (T)(rand()/(T)RAND_MAX - .5), (T)(rand()/(T)RAND_MAX - .5) ); }
+
+using namespace std;
+using namespace Eigen;
+
+
+template < typename T>
+complex<long double>  promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); }
+
+complex<long double>  promote(float x) { return complex<long double>((long double)x); }
+complex<long double>  promote(double x) { return complex<long double>((long double)x); }
+complex<long double>  promote(long double x) { return complex<long double>((long double)x); }
+    
+
+    template <typename VT1,typename VT2>
+    long double fft_rmse( const VT1 & fftbuf,const VT2 & timebuf)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        long double pi = acos((long double)-1 );
+        for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) {
+            complex<long double> acc = 0;
+            long double phinc = (long double)(-2.)*k0* pi / timebuf.size();
+            for (size_t k1=0;k1<(size_t)timebuf.size();++k1) {
+                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
+            }
+            totalpower += numext::abs2(acc);
+            complex<long double> x = promote(fftbuf[k0]); 
+            complex<long double> dif = acc - x;
+            difpower += numext::abs2(dif);
+            //cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(numext::abs2(dif)) << endl;
+        }
+        // cerr << "rmse:" << sqrt(difpower/totalpower) << endl;
+        return sqrt(difpower/totalpower);
+    }
+
+    // Relative RMS difference between two buffers, compared over the first
+    // min(buf1.size(), buf2.size()) elements. The difference power is
+    // normalized by the mean power of the two buffers.
+    // The buffers are taken by const reference (as fft_rmse does) so that
+    // each check no longer copies both containers.
+    template <typename VT1,typename VT2>
+    long double dif_rmse( const VT1 & buf1,const VT2 & buf2)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        // (min) is parenthesized to protect against the min()/max() macros
+        const size_t n = (min)( buf1.size(),buf2.size() );
+        for (size_t k=0;k<n;++k) {
+            totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2);
+            difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
+        }
+        return sqrt(difpower/totalpower);
+    }
+
+enum { StdVectorContainer, EigenVectorContainer };
+
+template<int Container, typename Scalar> struct VectorType;
+
+template<typename Scalar> struct VectorType<StdVectorContainer,Scalar>
+{
+  typedef vector<Scalar> type;
+};
+
+template<typename Scalar> struct VectorType<EigenVectorContainer,Scalar>
+{
+  typedef Matrix<Scalar,Dynamic,1> type;
+};
+
+template <int Container, typename T>
+void test_scalar_generic(int nfft)
+{
+    typedef typename FFT<T>::Complex Complex;
+    typedef typename FFT<T>::Scalar Scalar;
+    typedef typename VectorType<Container,Scalar>::type ScalarVector;
+    typedef typename VectorType<Container,Complex>::type ComplexVector;
+
+    FFT<T> fft;
+    ScalarVector tbuf(nfft);
+    ComplexVector freqBuf;
+    for (int k=0;k<nfft;++k)
+        tbuf[k]= (T)( rand()/(double)RAND_MAX - .5);
+
+    // with HalfSpectrum set, a real-input forward transform must return only
+    // the (nfft/2)+1 leading bins instead of the full nfft-bin spectrum
+    fft.SetFlag(fft.HalfSpectrum );
+    fft.fwd( freqBuf,tbuf);
+    VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) );
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
+
+    fft.ClearFlag(fft.HalfSpectrum );
+    fft.fwd( freqBuf,tbuf);
+    VERIFY( (size_t)freqBuf.size() == (size_t)nfft);
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
+
+    if (nfft&1)
+        return; // odd FFTs get the wrong size inverse FFT
+
+    ScalarVector tbuf2;
+    fft.inv( tbuf2 , freqBuf);
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
+
+
+    // verify that the Unscaled flag takes effect
+    ScalarVector tbuf3;
+    fft.SetFlag(fft.Unscaled);
+
+    fft.inv( tbuf3 , freqBuf);
+
+    for (int k=0;k<nfft;++k)
+        tbuf3[k] *= T(1./nfft);
+
+
+    //for (size_t i=0;i<(size_t) tbuf.size();++i)
+    //    cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " -  in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) <<  endl;
+
+    VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>()  );// gross check
+
+    // verify that ClearFlag works
+    fft.ClearFlag(fft.Unscaled);
+    fft.inv( tbuf2 , freqBuf);
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
+}
+
+template <typename T>
+void test_scalar(int nfft)
+{
+  test_scalar_generic<StdVectorContainer,T>(nfft);
+  //test_scalar_generic<EigenVectorContainer,T>(nfft);
+}
+
+
+template <int Container, typename T>
+void test_complex_generic(int nfft)
+{
+    typedef typename FFT<T>::Complex Complex;
+    typedef typename VectorType<Container,Complex>::type ComplexVector;
+
+    FFT<T> fft;
+
+    ComplexVector inbuf(nfft);
+    ComplexVector outbuf;
+    ComplexVector buf3;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( outbuf , inbuf);
+
+    VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>()  );// gross check
+    fft.inv( buf3 , outbuf);
+
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
+
+    // verify that the Unscaled flag takes effect
+    ComplexVector buf4;
+    fft.SetFlag(fft.Unscaled);
+    fft.inv( buf4 , outbuf);
+    for (int k=0;k<nfft;++k)
+        buf4[k] *= T(1./nfft);
+    VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>()  );// gross check
+
+    // verify that ClearFlag works
+    fft.ClearFlag(fft.Unscaled);
+    fft.inv( buf3 , outbuf);
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
+}
+
+template <typename T>
+void test_complex(int nfft)
+{
+  test_complex_generic<StdVectorContainer,T>(nfft);
+  test_complex_generic<EigenVectorContainer,T>(nfft);
+}
+
+// Checks the 2D complex transform on a random nrows x ncols matrix:
+//  - fwd2 must agree with a reference built from 1D transforms (every column
+//    first, then every row of that intermediate result);
+//  - inv2 applied to the fwd2 output must give back the input.
+template <typename T,int nrows,int ncols>
+void test_complex2d()
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+    typedef Eigen::Matrix<Complex,nrows,ncols> MatrixType;
+    typedef Eigen::Matrix<Complex,nrows,1> ColumnType;
+    typedef Eigen::Matrix<Complex,1,ncols> RowType;
+
+    FFT<T> fft;
+    MatrixType input = MatrixType::Random();
+    MatrixType roundtrip,spectrum,reference;
+
+    // reference, pass 1: 1D transform of each column
+    for (int c=0;c<ncols;++c) {
+        ColumnType colOut;
+        fft.fwd( colOut,input.col(c) );
+        reference.col(c) = colOut;
+    }
+
+    // reference, pass 2: 1D transform of each row, in place
+    for (int r=0;r<nrows;++r) {
+        RowType rowOut;
+        fft.fwd( rowOut,  reference.row(r) );
+        reference.row(r) = rowOut;
+    }
+
+    // The raw buffers are handed over as (ncols,nrows) -- presumably because
+    // the matrices use Eigen's default column-major storage; TODO confirm.
+    fft.fwd2(spectrum.data(),input.data(),ncols,nrows);
+    fft.inv2(roundtrip.data(),spectrum.data(),ncols,nrows);
+    VERIFY( (input-roundtrip).norm() < test_precision<T>() );
+    VERIFY( (spectrum-reference).norm() < test_precision<T>() );
+}
+
+void test_return_by_value(int len)
+{
+    VectorXf in;
+    VectorXf in1;
+    in.setRandom( len );
+    VectorXcf out1,out2;
+    FFT<float> fft;
+
+    fft.SetFlag(fft.HalfSpectrum );
+
+    fft.fwd(out1,in);
+    out2 = fft.fwd(in);
+    VERIFY( (out1-out2).norm() < test_precision<float>() );
+    in1 = fft.inv(out1);
+    VERIFY( (in1-in).norm() < test_precision<float>() );
+}
+
+EIGEN_DECLARE_TEST(FFTW)
+{
+  CALL_SUBTEST( test_return_by_value(32) );
+  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); 
+  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); 
+  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); 
+  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); 
+
+  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); 
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); 
+  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); 
+  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); 
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); 
+  
+  #if defined EIGEN_HAS_FFTWL || defined EIGEN_POCKETFFT_DEFAULT 
+  CALL_SUBTEST( test_complex<long double>(32) );
+  CALL_SUBTEST( test_complex<long double>(256) );
+  CALL_SUBTEST( test_complex<long double>(3*8) );
+  CALL_SUBTEST( test_complex<long double>(5*32) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+  
+  CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<long double>(45) );
+  CALL_SUBTEST( test_scalar<long double>(50) );
+  CALL_SUBTEST( test_scalar<long double>(256) );
+  CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
+
+  CALL_SUBTEST( ( test_complex2d<long double, 2*3*4, 2*3*4> () ) );
+  CALL_SUBTEST( ( test_complex2d<long double, 3*4*5, 3*4*5> () ) );
+  CALL_SUBTEST( ( test_complex2d<long double, 24, 60> () ) );
+  CALL_SUBTEST( ( test_complex2d<long double, 60, 24> () ) );
+  // Fails to build: a fixed-size 256x256 matrix exceeds Eigen's stack allocation size limit.
+  // CALL_SUBTEST( ( test_complex2d<long double, 256, 256> () ) ); 
+
+  #endif
+
+  #if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT
+  CALL_SUBTEST( ( test_complex2d<float, 24, 24> () ) );
+  CALL_SUBTEST( ( test_complex2d<float, 60, 60> () ) );
+  CALL_SUBTEST( ( test_complex2d<float, 24, 60> () ) );
+  CALL_SUBTEST( ( test_complex2d<float, 60, 24> () ) );
+
+  CALL_SUBTEST( ( test_complex2d<double, 24, 24> () ) );
+  CALL_SUBTEST( ( test_complex2d<double, 60, 60> () ) );
+  CALL_SUBTEST( ( test_complex2d<double, 24, 60> () ) );
+  CALL_SUBTEST( ( test_complex2d<double, 60, 24> () ) );
+  #endif
+
+}
--- a/unsupported/test/pocketfft.cpp
+++ b/unsupported/test/pocketfft.cpp
@ -0,0 +1,2 @@
+#define EIGEN_POCKETFFT_DEFAULT 1
+#include "fft_test_shared.h"