From 67ce07ea831fa2df4081436c0e8c4bfcd77d6ede Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Sat, 20 Feb 2010 14:45:50 +0000 Subject: [PATCH 1/8] matrix_function test: replace expm(A).inverse() by expm(-A) The latter is more stable. This fixes one of the issues with the test. Also, make typedef's in MatrixFunctionReturnValue public; this is necessary to get the test to compile. --- .../Eigen/src/MatrixFunctions/MatrixFunction.h | 6 ++---- unsupported/test/matrix_function.cpp | 12 ++++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index d63bcbce9..16fce5b29 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -492,14 +492,12 @@ typename MatrixFunction::DynMatrixType MatrixFunction class MatrixFunctionReturnValue : public ReturnByValue > { - private: + public: typedef typename ei_traits::Scalar Scalar; typedef typename ei_stem_function::type StemFunction; - public: - - /** \brief Constructor. + /** \brief Constructor. * * \param[in] A %Matrix (expression) forming the argument of the * matrix function. diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 4ff6d7f1e..7a1501da2 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -109,11 +109,10 @@ template void testHyperbolicFunctions(const MatrixType& A) { for (int i = 0; i < g_repeat; i++) { - MatrixType sinhA = ei_matrix_sinh(A); - MatrixType coshA = ei_matrix_cosh(A); MatrixType expA = ei_matrix_exponential(A); - VERIFY_IS_APPROX(sinhA, (expA - expA.inverse())/2); - VERIFY_IS_APPROX(coshA, (expA + expA.inverse())/2); + MatrixType expmA = ei_matrix_exponential(-A); + VERIFY_IS_APPROX(ei_matrix_sinh(A), (expA - expmA) / 2); + VERIFY_IS_APPROX(ei_matrix_cosh(A), (expA + expmA) / 2); } } @@ -134,14 +133,15 @@ void testGonioFunctions(const MatrixType& A) ComplexMatrix Ac = A.template cast(); ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac); + ComplexMatrix exp_miA = ei_matrix_exponential(-imagUnit * Ac); MatrixType sinA = ei_matrix_sin(A); ComplexMatrix sinAc = sinA.template cast(); - VERIFY_IS_APPROX(sinAc, (exp_iA - exp_iA.inverse()) / (two*imagUnit)); + VERIFY_IS_APPROX(sinAc, (exp_iA - exp_miA) / (two*imagUnit)); MatrixType cosA = ei_matrix_cos(A); ComplexMatrix cosAc = cosA.template cast(); - VERIFY_IS_APPROX(cosAc, (exp_iA + exp_iA.inverse()) / 2); + VERIFY_IS_APPROX(cosAc, (exp_iA + exp_miA) / 2); } } From 4e389b195d51fc6a01bfc5efe06751d84c37d0bf Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Sat, 20 Feb 2010 16:27:04 +0000 Subject: [PATCH 2/8] Change MatrixFunction::separation() parameter from 0.01 to 0.1 . The latter is actually the value used in the literature. --- unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index 16fce5b29..b4b652235 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -178,9 +178,9 @@ class MatrixFunction * * This is morally a \c static \c const \c Scalar, but only * integers can be static constant class members in C++. The - * separation constant is set to 0.01, a value taken from the + * separation constant is set to 0.1, a value taken from the * paper by Davies and Higham. */ - static const RealScalar separation() { return static_cast(0.01); } + static const RealScalar separation() { return static_cast(0.1); } }; /** \brief Constructor. From f797ba0abe5307e5b42069b316925b0cea53b188 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 22 Feb 2010 11:04:35 +0100 Subject: [PATCH 3/8] extend the bench timer to allow benchmarking of parallel code, improvements are welcome --- bench/BenchTimer.h | 93 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index b2d0fc5f6..6889654fe 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -23,24 +23,31 @@ // License and a copy of the GNU General Public License along with // Eigen. If not, see . -#ifndef EIGEN_BENCH_TIMER_H -#define EIGEN_BENCH_TIMER_H +#ifndef EIGEN_BENCH_TIMERR_H +#define EIGEN_BENCH_TIMERR_H #if defined(_WIN32) || defined(__CYGWIN__) #define NOMINMAX #define WIN32_LEAN_AND_MEAN #include #else +#include #include #include #endif +#include #include #include namespace Eigen { +enum { + CPU_TIMER = 0, + REAL_TIMER = 1 +}; + /** Elapsed time timer keeping the best try. * * On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID. @@ -52,37 +59,58 @@ class BenchTimer { public: - BenchTimer() - { + BenchTimer() + { #if defined(_WIN32) || defined(__CYGWIN__) LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); m_frequency = (double)freq.QuadPart; #endif - reset(); + reset(); } ~BenchTimer() {} - inline void reset(void) {m_best = 1e6;} - inline void start(void) {m_start = getTime();} - inline void stop(void) + inline void reset() { - m_best = std::min(m_best, getTime() - m_start); + m_bests.fill(1e9); + m_totals.setZero(); + } + inline void start() + { + m_starts[CPU_TIMER] = getCpuTime(); + m_starts[REAL_TIMER] = getRealTime(); + } + inline void stop() + { + m_times[CPU_TIMER] = getCpuTime() - m_starts[CPU_TIMER]; + m_times[REAL_TIMER] = getRealTime() - m_starts[REAL_TIMER]; + m_bests = m_bests.cwiseMin(m_times); + m_totals += m_times; } - /** Return the best elapsed time in seconds. + /** Return the elapsed time in seconds between the last start/stop pair */ - inline double value(void) + inline double value(int TIMER = CPU_TIMER) { - return m_best; + return m_times[TIMER]; } -#if defined(_WIN32) || defined(__CYGWIN__) - inline double getTime(void) -#else - static inline double getTime(void) -#endif + /** Return the best elapsed time in seconds + */ + inline double best(int TIMER = CPU_TIMER) + { + return m_bests[TIMER]; + } + + /** Return the total elapsed time in seconds. + */ + inline double total(int TIMER = CPU_TIMER) + { + return m_totals[TIMER]; + } + + inline double getCpuTime() { #ifdef WIN32 LARGE_INTEGER query_ticks; @@ -95,14 +123,41 @@ public: #endif } + inline double getRealTime() + { +#ifdef WIN32 + // TODO + return getCputime; +#else + struct timeval tv; + struct timezone tz; + gettimeofday(&tv, &tz); + return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec; +#endif + } + protected: #if defined(_WIN32) || defined(__CYGWIN__) double m_frequency; #endif - double m_best, m_start; + Vector2d m_starts; + Vector2d m_times; + Vector2d m_bests; + Vector2d m_totals; }; +#define BENCH(TIMER,TRIES,REP,CODE) { \ + TIMER.reset(); \ + for(int uglyvarname1=0; uglyvarname1 Date: Mon, 22 Feb 2010 11:23:27 +0100 Subject: [PATCH 4/8] Added getRealTime() for windows. --- bench/BenchTimer.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index 6889654fe..a32495d60 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -126,8 +126,9 @@ public: inline double getRealTime() { #ifdef WIN32 - // TODO - return getCputime; + SYSTEMTIME st; + GetSystemTime(&st); + return (double)st.wSecond + 1.e-6 * (double)st.wMilliseconds; #else struct timeval tv; struct timezone tz; From 3e6ab8f93bd794f3f81f08c98098b146791719de Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Mon, 22 Feb 2010 11:34:25 +0100 Subject: [PATCH 5/8] ups --- bench/BenchTimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index a32495d60..e49afa07f 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -128,7 +128,7 @@ public: #ifdef WIN32 SYSTEMTIME st; GetSystemTime(&st); - return (double)st.wSecond + 1.e-6 * (double)st.wMilliseconds; + return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds; #else struct timeval tv; struct timezone tz; From 51a4b929a17ec36c45b1fb814c566098a164b7df Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 22 Feb 2010 15:18:29 +0100 Subject: [PATCH 6/8] implement an even lower level version of the gebp kernel for MSVC (it seems to be faster with gcc as well) (transplanted from 9a5643551fe068497f84a81cd8986febf1918382) --- .../Core/products/GeneralBlockPanelKernel.h | 72 ++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index fe1987bdd..8c29d2218 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -76,6 +76,7 @@ struct ei_gebp_kernel { PacketType B0, B1, B2, B3, A0, A1; + #if 0 A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); @@ -134,6 +135,73 @@ struct ei_gebp_kernel if(nr==4) C3 = cj.pmadd(A0, B3, C3); if(nr==4) C7 = cj.pmadd(A1, B3, C7); + #else + + PacketType T0, T1; + + #define MADD(A,B,C,T) { T = A; T = ei_pmul(T,B); C = ei_padd(C,T); } + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + + MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); + MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1); + MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); + if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2); + if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); + if(nr==4) { MADD(A0,B3,C3,T0); }// C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[2*PacketSize]); + if(nr==4) { MADD(A1,B3,C7,T1); }// C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[3*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); + MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0); + MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4); + B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); + MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1); + MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); + if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2); + if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); + if(nr==4) { MADD(A0,B3,C3,T0); } // C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[4*PacketSize]); + if(nr==4) { MADD(A1,B3,C7,T1); }// C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[5*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); + + MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0); + MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4); + B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); + MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1); + MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); + if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2); + if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); + if(nr==4) { MADD(A0,B3,C3,T0); } // C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[6*PacketSize]); + if(nr==4) { MADD(A1,B3,C7,T1); } // C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[7*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); + MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0); + MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4); + MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1); + MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5); + if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2); + if(nr==4) { MADD(A1,B2,C6,T1); }//C6 = cj.pmadd(A1, B2, C6); + if(nr==4) { MADD(A0,B3,C3,T0); }//C3 = cj.pmadd(A0, B3, C3); + if(nr==4) { MADD(A1,B3,C7,T1); }//C7 = cj.pmadd(A1, B3, C7); + + #endif + blB += 4*nr*PacketSize; blA += 4*mr; } @@ -334,7 +402,7 @@ struct ei_gebp_kernel #endif PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; for(int k=0; k // skip what we have after if(PanelMode) count += PacketSize * nr * (stride-offset-depth); } - + // copy the remaining columns one at a time (nr==1) for(int j2=packet_cols; j2 Date: Mon, 22 Feb 2010 16:35:05 +0100 Subject: [PATCH 7/8] fully adapt the gebp kernel and optimize it for CPU with only 8 registers (transplanted from 2ed88ebbf1995be90b8d0c25ff10248c8f56d023) --- .../Core/products/GeneralBlockPanelKernel.h | 456 ++++++++++-------- 1 file changed, 262 insertions(+), 194 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8c29d2218..18e913b0e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -27,6 +27,12 @@ #ifndef EIGEN_EXTERN_INSTANTIATIONS +#ifdef EIGEN_HAS_FUSE_CJMADD +#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); +#else +#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T); +#endif + // optimized GEneral packed Block * packed Panel product kernel template struct ei_gebp_kernel @@ -74,133 +80,111 @@ struct ei_gebp_kernel const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; for(int k=0; k Date: Mon, 22 Feb 2010 15:39:17 +0100 Subject: [PATCH 8/8] provide default values for CXX, remove duplicate define --- bench/bench_unrolling | 3 ++- bench/benchmark_suite | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bench/bench_unrolling b/bench/bench_unrolling index bf01cce7d..826443845 100755 --- a/bench/bench_unrolling +++ b/bench/bench_unrolling @@ -2,10 +2,11 @@ # gcc : CXX="g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000" # icc : CXX="icpc -fast -no-inline-max-size -fno-exceptions" +CXX=${CXX-g++ -finline-limit=10000 -ftemplate-depth-2000 --param max-inline-recursive-depth=2000} # default value for ((i=1; i<16; ++i)); do echo "Matrix size: $i x $i :" - $CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=1024 -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null + $CXX -O3 -I.. -DNDEBUG benchmark.cpp -DMATSIZE=$i -DEIGEN_UNROLLING_LIMIT=400 -o benchmark && time ./benchmark >/dev/null $CXX -O3 -I.. -DNDEBUG -finline-limit=10000 benchmark.cpp -DMATSIZE=$i -DEIGEN_DONT_USE_UNROLLED_LOOPS=1 -o benchmark && time ./benchmark >/dev/null echo " " done diff --git a/bench/benchmark_suite b/bench/benchmark_suite index a8fc6dced..3f21d3661 100755 --- a/bench/benchmark_suite +++ b/bench/benchmark_suite @@ -1,4 +1,5 @@ #!/bin/bash +CXX=${CXX-g++} # default value unless caller has defined CXX echo "Fixed size 3x3, column-major, -DNDEBUG" $CXX -O3 -I .. -DNDEBUG benchmark.cpp -o benchmark && time ./benchmark >/dev/null echo "Fixed size 3x3, column-major, with asserts"