From 20cac72b8228de6c129caa983b25facddad0e009 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 17 Feb 2015 22:58:32 +0100 Subject: [PATCH 01/18] Packet must be passed by const reference and not by value to avoid alignment issue. --- Eigen/src/Core/util/BlasUtil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 3ec55fad2..9bfa45106 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -166,7 +166,7 @@ class BlasLinearMapper { return ploadt(m_data + i); } - EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { pstoret(m_data + i, p); } From 24d65ac0b0121ad70984ce1871cf2a8116b42201 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 18 Feb 2015 01:03:32 +0100 Subject: [PATCH 02/18] Removed redundant typedef which confused old gcc versions. --- test/sizeoverflow.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/sizeoverflow.cpp b/test/sizeoverflow.cpp index 16d6f8d04..240d22294 100644 --- a/test/sizeoverflow.cpp +++ b/test/sizeoverflow.cpp @@ -18,8 +18,6 @@ VERIFY(threw && "should have thrown bad_alloc: " #a); \ } -typedef DenseIndex Index; - template void triggerMatrixBadAlloc(Index rows, Index cols) { From d4eda014889541e3a22680bf236bf814a6fbc813 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 11:24:32 +0100 Subject: [PATCH 03/18] Bug 957, workaround MSVC/ICC compilation issue --- Eigen/src/SparseCore/SparseBlock.h | 6 ++++-- Eigen/src/SparseCore/SparseMatrixBase.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 40dc1a2bd..acd82e926 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -292,7 +292,8 @@ const typename SparseMatrixBase::ConstInnerVectorReturnType SparseMatri * is 
col-major (resp. row-major). */ template -Block SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) +typename SparseMatrixBase::InnerVectorsReturnType +SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) { return Block(derived(), IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, @@ -304,7 +305,8 @@ Block SparseMatrixBase::innerVectors(Inde * is col-major (resp. row-major). Read-only. */ template -const Block SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) const +const typename SparseMatrixBase::ConstInnerVectorsReturnType +SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) const { return Block(derived(), IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 9039ebcec..d76dfa33d 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -321,8 +321,10 @@ template class SparseMatrixBase : public EigenBase const ConstInnerVectorReturnType innerVector(Index outer) const; // set of inner-vectors - Block innerVectors(Index outerStart, Index outerSize); - const Block innerVectors(Index outerStart, Index outerSize) const; + typedef Block InnerVectorsReturnType; + typedef Block ConstInnerVectorsReturnType; + InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize); + const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const; DenseMatrixType toDense() const { From dc7e6acc05f6d546db401545582bdd13c0331596 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 11:26:25 +0100 Subject: [PATCH 04/18] Fix possible usage of a null pointer in CholmodSupport --- Eigen/src/CholmodSupport/CholmodSupport.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index 8ef0fb3b5..d2b0fb282 100644 --- 
a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h @@ -277,6 +277,7 @@ class CholmodBase : public SparseSolverBase if(!x_cd) { this->m_info = NumericalIssue; + return; } // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) dest = Matrix::Map(reinterpret_cast(x_cd->x),b.rows(),b.cols()); @@ -298,6 +299,7 @@ class CholmodBase : public SparseSolverBase if(!x_cs) { this->m_info = NumericalIssue; + return; } // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) dest = viewAsEigen(*x_cs); From eb563049f7041170976998273e4d48e96b11b08f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 11:26:48 +0100 Subject: [PATCH 05/18] Remove some dead stores. --- Eigen/src/Core/arch/SSE/MathFunctions.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 9ffba5b41..f86c0a39a 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -138,7 +138,6 @@ Packet4f pexp(const Packet4f& _x) #ifdef EIGEN_VECTORIZE_SSE4_1 fx = _mm_floor_ps(fx); #else - tmp = _mm_setzero_ps(); emm0 = _mm_cvttps_epi32(fx); tmp = _mm_cvtepi32_ps(emm0); /* if greater, substract 1 */ @@ -207,7 +206,6 @@ Packet2d pexp(const Packet2d& _x) #ifdef EIGEN_VECTORIZE_SSE4_1 fx = _mm_floor_pd(fx); #else - tmp = _mm_setzero_pd(); emm0 = _mm_cvttpd_epi32(fx); tmp = _mm_cvtepi32_pd(emm0); /* if greater, substract 1 */ From 63464754ef747e0c3d16c5da6fd4d4228ab8dd7a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 11:29:54 +0100 Subject: [PATCH 06/18] Add an internal assertion in makeCompressed to catch a possible risk of null-pointer access. 
--- Eigen/src/SparseCore/SparseMatrix.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 4c8965802..4562f3df9 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -467,6 +467,8 @@ class SparseMatrix if(isCompressed()) return; + eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0); + Index oldStart = m_outerIndex[1]; m_outerIndex[1] = m_innerNonZeros[0]; for(Index j=1; j Date: Wed, 18 Feb 2015 11:30:44 +0100 Subject: [PATCH 07/18] Workaround dead store warnings in unit tests. --- test/cholesky.cpp | 7 +++++-- test/eigensolver_complex.cpp | 1 + test/eigensolver_generic.cpp | 1 + test/eigensolver_selfadjoint.cpp | 10 ++++------ test/inverse.cpp | 6 +++++- test/product_notemporary.cpp | 5 +++-- test/product_selfadjoint.cpp | 9 ++++++++- test/product_syrk.cpp | 5 +++-- test/product_trmv.cpp | 6 ++++-- 9 files changed, 34 insertions(+), 16 deletions(-) diff --git a/test/cholesky.cpp b/test/cholesky.cpp index 33e32a322..9335270f4 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -380,10 +380,14 @@ void test_cholesky() CALL_SUBTEST_3( cholesky_definiteness(Matrix2d()) ); CALL_SUBTEST_4( cholesky(Matrix3f()) ); CALL_SUBTEST_5( cholesky(Matrix4d()) ); - s = internal::random(1,EIGEN_TEST_MAX_SIZE); + + s = internal::random(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_2( cholesky(MatrixXd(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + s = internal::random(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_6( cholesky_cplx(MatrixXcd(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) } CALL_SUBTEST_4( cholesky_verify_assert() ); @@ -395,6 +399,5 @@ void test_cholesky() CALL_SUBTEST_9( LLT(10) ); CALL_SUBTEST_9( LDLT(10) ); - TEST_SET_BUT_UNUSED_VARIABLE(s) TEST_SET_BUT_UNUSED_VARIABLE(nb_temporaries) } diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp index c9d8c0877..bf8d2deb0 100644 --- a/test/eigensolver_complex.cpp +++ b/test/eigensolver_complex.cpp 
@@ -108,6 +108,7 @@ void test_eigensolver_complex() CALL_SUBTEST_2( eigensolver(MatrixXcd(s,s)) ); CALL_SUBTEST_3( eigensolver(Matrix, 1, 1>()) ); CALL_SUBTEST_4( eigensolver(Matrix3f()) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) } CALL_SUBTEST_1( eigensolver_verify_assert(Matrix4cf()) ); s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index 92d33f66a..c5441ac4e 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -93,6 +93,7 @@ void test_eigensolver_generic() CALL_SUBTEST_1( eigensolver(Matrix4f()) ); s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); CALL_SUBTEST_2( eigensolver(MatrixXd(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) // some trivial but implementation-wise tricky cases CALL_SUBTEST_2( eigensolver(MatrixXd(1,1)) ); diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index 935736328..7b0077a6d 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp @@ -154,15 +154,13 @@ void test_eigensolver_selfadjoint() CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) ); CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) ); CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) ); - s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); - CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) ); - s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); - CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(s,s)) ); - s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); - CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(s,s)) ); s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); + CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) ); + CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(s,s)) ); + CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(s,s)) ); CALL_SUBTEST_9( selfadjointeigensolver(Matrix,Dynamic,Dynamic,RowMajor>(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) // some trivial but implementation-wise tricky cases CALL_SUBTEST_4( 
selfadjointeigensolver(MatrixXd(1,1)) ); diff --git a/test/inverse.cpp b/test/inverse.cpp index 1e7b20958..b09989aca 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -102,12 +102,16 @@ void test_inverse() CALL_SUBTEST_3( inverse(Matrix3f()) ); CALL_SUBTEST_4( inverse(Matrix4f()) ); CALL_SUBTEST_4( inverse(Matrix()) ); + s = internal::random(50,320); CALL_SUBTEST_5( inverse(MatrixXf(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + s = internal::random(25,100); CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + CALL_SUBTEST_7( inverse(Matrix4d()) ); CALL_SUBTEST_7( inverse(Matrix()) ); } - TEST_SET_BUT_UNUSED_VARIABLE(s) } diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index 805cc8939..898f1d1cb 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -129,11 +129,12 @@ void test_product_notemporary() for(int i = 0; i < g_repeat; i++) { s = internal::random(16,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_1( product_notemporary(MatrixXf(s, s)) ); - s = internal::random(16,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_2( product_notemporary(MatrixXd(s, s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + s = internal::random(16,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_3( product_notemporary(MatrixXcf(s,s)) ); - s = internal::random(16,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_4( product_notemporary(MatrixXcd(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) } } diff --git a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp index 374e2393b..3d768aa7e 100644 --- a/test/product_selfadjoint.cpp +++ b/test/product_selfadjoint.cpp @@ -67,14 +67,21 @@ void test_product_selfadjoint() CALL_SUBTEST_1( product_selfadjoint(Matrix()) ); CALL_SUBTEST_2( product_selfadjoint(Matrix()) ); CALL_SUBTEST_3( product_selfadjoint(Matrix3d()) ); + s = internal::random(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_4( product_selfadjoint(MatrixXcf(s, s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + s = internal::random(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_5( 
product_selfadjoint(MatrixXcd(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + s = internal::random(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_6( product_selfadjoint(MatrixXd(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + s = internal::random(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_7( product_selfadjoint(Matrix(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) } - TEST_SET_BUT_UNUSED_VARIABLE(s) } diff --git a/test/product_syrk.cpp b/test/product_syrk.cpp index 73c95000c..e10f0f2f2 100644 --- a/test/product_syrk.cpp +++ b/test/product_syrk.cpp @@ -125,11 +125,12 @@ void test_product_syrk() int s; s = internal::random(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_1( syrk(MatrixXf(s, s)) ); - s = internal::random(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_2( syrk(MatrixXd(s, s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + s = internal::random(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) ); - s = internal::random(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) } } diff --git a/test/product_trmv.cpp b/test/product_trmv.cpp index 4c3c435c2..57a202afc 100644 --- a/test/product_trmv.cpp +++ b/test/product_trmv.cpp @@ -78,12 +78,14 @@ void test_product_trmv() CALL_SUBTEST_1( trmv(Matrix()) ); CALL_SUBTEST_2( trmv(Matrix()) ); CALL_SUBTEST_3( trmv(Matrix3d()) ); + s = internal::random(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_4( trmv(MatrixXcf(s,s)) ); - s = internal::random(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_5( trmv(MatrixXcd(s,s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) + s = internal::random(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_6( trmv(Matrix(s, s)) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) } - TEST_SET_BUT_UNUSED_VARIABLE(s); } From 6f4adc9e9428c383236d60004a64fddfbeaf254f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 11:40:06 +0100 Subject: [PATCH 08/18] Add missing install directives for arch/CUDA --- Eigen/src/Core/arch/CMakeLists.txt | 8 ++++++-- Eigen/src/Core/arch/CUDA/CMakeLists.txt | 6 ++++++ 2 files changed, 12 
insertions(+), 2 deletions(-) create mode 100644 Eigen/src/Core/arch/CUDA/CMakeLists.txt diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt index 0db8c558d..42b0b486e 100644 --- a/Eigen/src/Core/arch/CMakeLists.txt +++ b/Eigen/src/Core/arch/CMakeLists.txt @@ -1,5 +1,9 @@ -ADD_SUBDIRECTORY(SSE) ADD_SUBDIRECTORY(AltiVec) -ADD_SUBDIRECTORY(NEON) ADD_SUBDIRECTORY(AVX) +ADD_SUBDIRECTORY(CUDA) ADD_SUBDIRECTORY(Default) +ADD_SUBDIRECTORY(NEON) +ADD_SUBDIRECTORY(SSE) + + + diff --git a/Eigen/src/Core/arch/CUDA/CMakeLists.txt b/Eigen/src/Core/arch/CUDA/CMakeLists.txt new file mode 100644 index 000000000..7ba28da7c --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_CUDA_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_CUDA_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/CUDA COMPONENT Devel +) From 548b7813805d9e314f97eb6f731d711df663a46b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 12:53:49 +0100 Subject: [PATCH 09/18] Fix bug #945: workaround MSVC warning --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 15bf04d1f..ce5494182 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1788,14 +1788,14 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; kernel.packet[0] = dm0.loadPacket(k); - kernel.packet[1] = dm1.loadPacket(k); - kernel.packet[2] = dm2.loadPacket(k); - kernel.packet[3] = dm3.loadPacket(k); + kernel.packet[1%PacketSize] = dm1.loadPacket(k); + kernel.packet[2%PacketSize] = dm2.loadPacket(k); + kernel.packet[3%PacketSize] = dm3.loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); - pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); 
- pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2])); - pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize])); count+=4*PacketSize; } } From c7bb1e8ea8dfc984788d0cb77b82a90468393c2e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 15:19:23 +0100 Subject: [PATCH 10/18] Fix a regression when using OpenMP, and fix bug #714: the number of threads might be lower than the number of requested ones --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 25 ++++++++++++++++--- Eigen/src/Core/products/Parallelizer.h | 19 ++++++++------ test/product_large.cpp | 3 +-- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 44e44b986..c38c12c31 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -217,8 +217,9 @@ struct gemm_functor : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {} - void initParallelSession() const + void initParallelSession(Index num_threads) const { + m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads); m_blocking.allocateA(); } @@ -276,7 +277,7 @@ class level3_blocking }; template -class gemm_blocking_space +class gemm_blocking_space : public level3_blocking< typename conditional::type, typename conditional::type> @@ -299,7 +300,7 @@ class gemm_blocking_spacem_mc = ActualRows; this->m_nc = ActualCols; @@ -307,6 +308,9 @@ class gemm_blocking_spacem_blockA = m_staticA; this->m_blockB = m_staticB; } + + void initParallel(Index, Index, Index, Index) + {} inline void allocateA() {} inline void allocateB() {} @@ -331,7 +335,7 @@ class gemm_blocking_spacem_mc = 
Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; @@ -351,6 +355,19 @@ class gemm_blocking_spacem_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } + + void initParallel(Index rows, Index cols, Index depth, Index num_threads) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? rows : cols; + this->m_kc = depth; + + eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0); + Index m = this->m_mc; + computeProductBlockingSizes(this->m_kc, m, this->m_nc, num_threads); + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } void allocateA() { diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 2b90abf8f..91d37a123 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -120,25 +120,28 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos return func(0,rows, 0,cols); Eigen::initParallel(); - func.initParallelSession(); + func.initParallelSession(threads); if(transpose) std::swap(rows,cols); - - Index blockCols = (cols / threads) & ~Index(0x3); - Index blockRows = (rows / threads); - blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; ei_declare_aligned_stack_constructed_variable(GemmParallelInfo,info,threads,0); - + #pragma omp parallel num_threads(threads) { Index i = omp_get_thread_num(); + // Note that the actual number of threads might be lower than the number of request ones. + Index actual_threads = omp_get_num_threads(); + + Index blockCols = (cols / actual_threads) & ~Index(0x3); + Index blockRows = (rows / actual_threads); + blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; + Index r0 = i*blockRows; - Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows; + Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows; Index c0 = i*blockCols; - Index actualBlockCols = (i+1==threads) ? 
cols-c0 : blockCols; + Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols; info[i].lhs_start = r0; info[i].lhs_length = actualBlockRows; diff --git a/test/product_large.cpp b/test/product_large.cpp index ffb8b7bf2..84c489580 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -64,8 +64,7 @@ void test_product_large() #endif // Regression test for bug 714: -#ifdef EIGEN_HAS_OPENMP - std::cout << "Testing omp_set_dynamic(1)\n"; +#if defined EIGEN_HAS_OPENMP omp_set_dynamic(1); for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_6( product(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); From 4a3e6c8be1d4752b3172a7e26631c4669e28dde7 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 18 Feb 2015 09:43:55 -0500 Subject: [PATCH 11/18] bug #958 - Allow testing specific blocking sizes This is only a debugging/testing patch. It allows testing specific product blocking sizes, typically to study the impact on performance. 
Example usage: int testk, testm, testn; #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K testk #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M testm #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N testn #include --- .../src/Core/products/GeneralBlockPanelKernel.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ce5494182..f3fede441 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -84,6 +84,22 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff template void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + EIGEN_UNUSED_VARIABLE(num_threads); + typedef gebp_traits Traits; + enum { + kr = 16, + mr = Traits::mr, + nr = Traits::nr + }; + k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + if (k > kr) k -= k % kr; + m = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + if (m > mr) m -= m % mr; + n = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + if (n > nr) n -= n % nr; + return; +#endif // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed From fc5c3e85e2a59a3b366970793195538969462a64 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 15:47:01 +0100 Subject: [PATCH 12/18] Fix bug #961: eigen-doc.tgz included part of itself. 
--- doc/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 46e5fc9d7..4d01a0424 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -100,7 +100,8 @@ add_custom_target(doc ALL COMMAND ${CMAKE_COMMAND} -E copy ${Eigen_BINARY_DIR}/doc/html/group__TopicUnalignedArrayAssert.html ${Eigen_BINARY_DIR}/doc/html/TopicUnalignedArrayAssert.html COMMAND ${CMAKE_COMMAND} -E rename html eigen-doc COMMAND ${CMAKE_COMMAND} -E remove eigen-doc/eigen-doc.tgz - COMMAND ${CMAKE_COMMAND} -E tar cfz eigen-doc/eigen-doc.tgz eigen-doc + COMMAND ${CMAKE_COMMAND} -E tar cfz eigen-doc.tgz eigen-doc + COMMAND ${CMAKE_COMMAND} -E rename eigen-doc.tgz eigen-doc/eigen-doc.tgz COMMAND ${CMAKE_COMMAND} -E rename eigen-doc html WORKING_DIRECTORY ${Eigen_BINARY_DIR}/doc) From 63eb0f6fe6c7223c3bb2a2ea9495fddcc1e4b6f2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Feb 2015 15:49:05 +0100 Subject: [PATCH 13/18] Clean a bit computeProductBlockingSizes (use Index type, remove CEIL macro) --- .../Core/products/GeneralBlockPanelKernel.h | 55 +++++++++---------- Eigen/src/Core/util/Meta.h | 8 +++ 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ce5494182..dc679b3fe 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -79,18 +79,15 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff * - the number of scalars that fit into a packet (when vectorization is enabled). 
* * \sa setCpuCacheSizes */ -#define CEIL(a, b) ((a)+(b)-1)/(b) -template -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) +template +void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { // Explanations: - // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and - // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed - // per kc x nr vertical small panels where nr is the blocking size along the n dimension - // at the register level. For vectorization purpose, these small vertical panels are unpacked, - // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to - // stay in L1 cache. + // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and + // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed + // per mr x kc horizontal small panels where mr is the blocking size along the m dimension + // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); @@ -108,32 +105,32 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_ nr = Traits::nr, nr_mask = (0xffffffff/nr)*nr }; - SizeType k_cache = (l1-ksub)/kdiv; + Index k_cache = (l1-ksub)/kdiv; if (k_cache < k) { k = k_cache & k_mask; - eigen_assert(k > 0); + eigen_internal_assert(k > 0); } - SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); - SizeType n_per_thread = CEIL(n, num_threads); + Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + Index n_per_thread = numext::div_ceil(n, num_threads); if (n_cache <= n_per_thread) { // Don't exceed the capacity of the l2 cache. 
- eigen_assert(n_cache >= static_cast(nr)); + eigen_internal_assert(n_cache >= static_cast(nr)); n = n_cache & nr_mask; - eigen_assert(n > 0); + eigen_internal_assert(n > 0); } else { - n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); } if (l3 > l2) { // l3 is shared between all cores, so we'll give each thread its own chunk of l3. - SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); - SizeType m_per_thread = CEIL(m, num_threads); - if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { + Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + Index m_per_thread = numext::div_ceil(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { m = m_cache & mr_mask; - eigen_assert(m > 0); + eigen_internal_assert(m > 0); } else { - m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); } } } @@ -141,19 +138,19 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_ // In unit tests we do not want to use extra large matrices, // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + k = std::min(k,sizeof(LhsScalar)<=4 ? 
360 : 240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); #else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); #endif } } -template -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) +template +inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { computeProductBlockingSizes(k, m, n, num_threads); } diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index f3bafd5af..3be9e6ca5 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -284,6 +284,14 @@ template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif +// Integer division with rounding up. +// T is assumed to be an integer type with a>=0, and b>0 +template +T div_ceil(const T &a, const T &b) +{ + return (a+b-1) / b; +} + } // end namespace numext } // end namespace Eigen From ee27d50633a676986fc0841dcf5d06bed0d4bd8b Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Wed, 18 Feb 2015 18:51:08 +0100 Subject: [PATCH 14/18] Fixed template parameter. 
--- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 12948a20c..c8a1dcced 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -91,11 +91,11 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads mr = Traits::mr, nr = Traits::nr }; - k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); if (k > kr) k -= k % kr; - m = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + m = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); if (m > mr) m -= m % mr; - n = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + n = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); if (n > nr) n -= n % nr; return; #endif From 9bd8a4bab58231a1d3afe0dd43a7c72f217dfec1 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 18 Feb 2015 15:03:35 -0500 Subject: [PATCH 15/18] bug #955 - Implement a rotating kernel alternative in the 3px4 gebp path This is substantially faster on ARM, where it's important to minimize the number of loads. This is specific to the case where all packet types are of size 4. I made my best attempt to minimize how dirty this is... opinions welcome. Eventually one could have a generic rotated kernel, but it would take some work to get there. Also, on sandy bridge, in my experience, it's not beneficial (even about 1% slower). 
--- Eigen/src/Core/GenericPacketMath.h | 15 ++++ Eigen/src/Core/arch/NEON/PacketMath.h | 25 ++++++ Eigen/src/Core/arch/SSE/PacketMath.h | 23 +++++ .../Core/products/GeneralBlockPanelKernel.h | 86 +++++++++++++++---- Eigen/src/Core/util/StaticAssert.h | 3 +- 5 files changed, 133 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 74e1174ae..967a07df5 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -287,6 +287,21 @@ template EIGEN_DEVICE_FUNC inline typename unpacket_traits EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } +template +struct protate_impl +{ + static Packet run(const Packet& a) { return a; } +}; + +/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, + * by the given offset, e.g. for offset == 1: + * (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1]) + */ +template EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a) +{ + EIGEN_STATIC_ASSERT(offset < unpacket_traits::size, ROTATION_BY_ILLEGAL_OFFSET); + return offset ? 
protate_impl::run(a) : a; +} /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 8149aed7f..e9af45f22 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -309,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { a_hi = vget_high_s32(a_r64); return vcombine_s32(a_hi, a_lo); } + +template +struct protate_impl +{ + static Packet4f run(const Packet4f& a) { + return vextq_f32(a, a, offset); + } +}; + +template +struct protate_impl +{ + static Packet4i run(const Packet4i& a) { + return vextq_s32(a, a, offset); + } +}; + template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } @@ -625,6 +642,14 @@ template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { retu template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } +template +struct protate_impl +{ + static Packet2d run(const Packet2d& a) { + return vextq_f64(a, a, offset); + } +}; + template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } #if EIGEN_COMP_CLANG && defined(__apple_build_version__) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b5a0ba2bc..3653783fd 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -462,6 +462,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); } +template +struct protate_impl +{ + static Packet4f run(const Packet4f& a) { + return vec4f_swizzle1(a, offset, (offset + 1) % 4, 
(offset + 2) % 4, (offset + 3) % 4); + } +}; + +template +struct protate_impl +{ + static Packet4i run(const Packet4i& a) { + return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4); + } +}; + +template +struct protate_impl +{ + static Packet2d run(const Packet2d& a) { + return vec2d_swizzle1(a, offset, (offset + 1) % 2); + } +}; template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c8a1dcced..6a16aa661 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -771,7 +771,19 @@ void gebp_kernel(&blB[(0+4*K)*RhsProgress]); \ + } else { \ + EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \ + B_0 = protate<1>(B_0); \ + } \ + } else { \ + traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \ + } \ + } while (false) + +#define EIGEN_GEBP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ @@ -827,34 +853,34 @@ void gebp_kernel resblock; \ + resblock.packet[0] = res0; \ + resblock.packet[1] = res1; \ + resblock.packet[2] = res2; \ + resblock.packet[3] = res3; \ + ptranspose(resblock); \ + resblock.packet[3] = protate<1>(resblock.packet[3]); \ + resblock.packet[2] = protate<2>(resblock.packet[2]); \ + resblock.packet[1] = protate<3>(resblock.packet[1]); \ + ptranspose(resblock); \ + res0 = resblock.packet[0]; \ + res1 = resblock.packet[1]; \ + res2 = resblock.packet[2]; \ + res3 = resblock.packet[3]; \ + } while (false) + + EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3); + EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7); + EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11); + } + ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 7538a0633..5e16b775b 
100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -93,7 +93,8 @@ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH + STORAGE_LAYOUT_DOES_NOT_MATCH, + ROTATION_BY_ILLEGAL_OFFSET }; }; From 0ed00d543884e6cdd5a406789cb19f1b453db809 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 18 Feb 2015 15:05:01 -0500 Subject: [PATCH 16/18] remove a newly introduced redundant typedef - sorry. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 6a16aa661..15f3d869e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -83,9 +83,10 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff template void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { + typedef gebp_traits Traits; + #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES EIGEN_UNUSED_VARIABLE(num_threads); - typedef gebp_traits Traits; enum { kr = 16, mr = Traits::mr, @@ -99,6 +100,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads if (n > nr) n -= n % nr; return; #endif + // Explanations: // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. 
Moreover, A' is processed @@ -108,7 +110,6 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads manage_caching_sizes(GetAction, &l1, &l2, &l3); if (num_threads > 1) { - typedef gebp_traits Traits; typedef typename Traits::ResScalar ResScalar; enum { kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), From db05f2d01e6e5d8bcd67643ed2836028071d967f Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 18 Feb 2015 15:43:52 -0500 Subject: [PATCH 17/18] rotating kernel: avoid compiling anything outside of ARM --- .../Core/products/GeneralBlockPanelKernel.h | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 15f3d869e..ccd906540 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -773,18 +773,6 @@ void gebp_kernel(B_0); \ } \ } else { \ - traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0); \ + EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \ } \ } while (false) +#else +#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \ + EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) +#endif #define EIGEN_GEBP_ONESTEP(K) \ do { \ @@ -897,8 +902,12 @@ void gebp_kernel(alpha); From 829dddd0fdb1bfd4942ff35fe7dfd4ccdeddc97e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Feb 2015 15:18:37 +0100 Subject: [PATCH 18/18] Add support for C++11 result_of/lambdas --- Eigen/src/Core/util/Macros.h | 5 +++++ Eigen/src/Core/util/Meta.h | 32 ++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e607cdd12..aaea9f035 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -382,6 +382,11 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support result_of? 
+#if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)) +#define EIGEN_HAS_STD_RESULT_OF 1 +#endif + // Does the compiler support variadic templates? #if __cplusplus > 199711L #define EIGEN_HAS_VARIADIC_TEMPLATES 1 diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 3be9e6ca5..674cd8f97 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -165,6 +165,7 @@ template struct result_of {}; struct has_none {int a[1];}; struct has_std_result_type {int a[2];}; struct has_tr1_result {int a[3];}; +struct has_cxx_eleven_result {int a[4];}; template struct unary_result_of_select {typedef ArgType type;}; @@ -175,13 +176,22 @@ struct unary_result_of_select {typed template struct unary_result_of_select {typedef typename Func::template result::type type;}; +#ifdef EIGEN_HAS_STD_RESULT_OF +template +struct unary_result_of_select {typedef typename std::result_of::type type;}; +#endif + template struct result_of { template - static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); +#ifdef EIGEN_HAS_STD_RESULT_OF + template + static has_cxx_eleven_result testFunctor(T const *, typename std::result_of::type const * = 0); +#endif + static has_none testFunctor(...); // note that the following indirection is needed for gcc-3.3 enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; @@ -199,13 +209,23 @@ template struct binary_result_of_select {typedef typename Func::template result::type type;}; +#ifdef EIGEN_HAS_STD_RESULT_OF +template +struct binary_result_of_select +{typedef typename std::result_of::type type;}; +#endif + template struct result_of { template - 
static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); +#ifdef EIGEN_HAS_STD_RESULT_OF + template + static has_cxx_eleven_result testFunctor(T const *, typename std::result_of::type const * = 0); +#endif + static has_none testFunctor(...); // note that the following indirection is needed for gcc-3.3 enum {FunctorType = sizeof(testFunctor(static_cast(0)))};