From 4084dce038ea8b1011be6068b68e3aa10523ac07 Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Thu, 26 Feb 2015 16:50:09 -0500 Subject: [PATCH 01/83] Added CMake support for Tensor module. CMake now installs CXX11 Tensor module like the rest of the unsupported modules --- unsupported/Eigen/CMakeLists.txt | 1 + unsupported/Eigen/CXX11/CMakeLists.txt | 8 ++++++++ unsupported/Eigen/CXX11/src/CMakeLists.txt | 3 +++ unsupported/Eigen/CXX11/src/Core/CMakeLists.txt | 1 + unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt | 6 ++++++ unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt | 8 ++++++++ .../Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt | 6 ++++++ 8 files changed, 39 insertions(+) create mode 100644 unsupported/Eigen/CXX11/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/Core/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt create mode 100644 unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt index e06f1238b..6faf4585d 100644 --- a/unsupported/Eigen/CMakeLists.txt +++ b/unsupported/Eigen/CMakeLists.txt @@ -9,3 +9,4 @@ install(FILES ) add_subdirectory(src) +add_subdirectory(CXX11) \ No newline at end of file diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt new file mode 100644 index 000000000..f1d9f0482 --- /dev/null +++ b/unsupported/Eigen/CXX11/CMakeLists.txt @@ -0,0 +1,8 @@ +set(Eigen_CXX11_HEADERS Core Tensor TensorSymmetry) + +install(FILES + ${Eigen_CXX11_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel + ) + +add_subdirectory(src) diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt new file mode 100644 index 000000000..d90ee1b0f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(Core) +add_subdirectory(Tensor) +add_subdirectory(TensorSymmetry) diff --git a/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt new file mode 100644 index 000000000..28571dcb9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt new file mode 100644 index 000000000..1e3b14712 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/util/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_Core_util_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_Core_util_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Core/util COMPONENT Devel + ) diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt new file mode 100644 index 000000000..6d4b3ea0d --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_Tensor_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel + ) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt 
new file mode 100644 index 000000000..6e871a8da --- /dev/null +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt @@ -0,0 +1,8 @@ +FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_TensorSymmetry_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel + ) + +add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt new file mode 100644 index 000000000..dc9fc78ec --- /dev/null +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h") + +INSTALL(FILES + ${Eigen_CXX11_TensorSymmetry_util_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel + ) From b10cd3afd2ba85d0e8bca68bf0e716391e185837 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 27 Feb 2015 22:38:00 +0100 Subject: [PATCH 02/83] Re-enbale detection of min/max parentheses protection, and re-enable mpreal_support unit test. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- test/main.h | 10 ++++++++-- unsupported/test/CMakeLists.txt | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9061fd936..1f57b6de0 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -164,7 +164,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway // because we use the coefficient-based implementation for them. - if(std::max(k,std::max(m,n))<48) + if((std::max)(k,(std::max)(m,n))<48) return; typedef typename Traits::ResScalar ResScalar; diff --git a/test/main.h b/test/main.h index 1f937690c..ecf0c6924 100644 --- a/test/main.h +++ b/test/main.h @@ -42,13 +42,19 @@ #include #include #include +#if __cplusplus >= 201103L +#include +#ifdef EIGEN_USE_THREADS +#include +#endif +#endif // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
-//#define min(A,B) please_protect_your_min_with_parentheses -//#define max(A,B) please_protect_your_max_with_parentheses +#define min(A,B) please_protect_your_min_with_parentheses +#define max(A,B) please_protect_your_max_with_parentheses #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 8542dd853..9eabfb620 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -50,7 +50,7 @@ if(MPFR_FOUND) include_directories(${MPFR_INCLUDES} ./mpreal) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) -# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) + ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() From 9aee1e300ad523d6bdc45a7592416ce96df9a82f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 27 Feb 2015 22:55:12 +0100 Subject: [PATCH 03/83] Increase unit-test L1 cache size to ensure we are doing at least 2 peeled loop within product kernel. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1f57b6de0..0fc7289be 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -155,7 +155,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // In unit tests we do not want to use extra large matrices, // so we reduce the cache size to check the blocking strategy is not flawed #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS - l1 = 4*1024; + l1 = 9*1024; l2 = 32*1024; l3 = 512*1024; #endif From 0f82a1d7b7a17302e8dceb370864ba42a6a13102 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 14:52:26 +0100 Subject: [PATCH 04/83] bug #967: Automatically add cxx11 suffix when building in C++11 mode --- cmake/EigenTesting.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index b4ab95dbc..f8aec777d 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -502,6 +502,10 @@ macro(ei_set_build_string) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-64bit) endif() + if(EIGEN_TEST_CXX11) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11) + endif() + if(EIGEN_BUILD_STRING_SUFFIX) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX}) endif() From 33f40b2883630b5ddec3bab5730691b40120faa6 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 14:53:11 +0100 Subject: [PATCH 05/83] Cygwin does not like weak linking either. 
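
Aside on the mechanism this patch touches: blas/xerbla.cpp provides Eigen's default BLAS error handler, and on GCC-compatible toolchains it is emitted as a weak symbol so that a user-supplied, strongly linked definition overrides it at link time. The hunk below merely adds __CYGWIN__ to the platforms where the attribute has to be dropped. A minimal sketch of the pattern follows; the handler name and signature here are illustrative only, not the verbatim contents of blas/xerbla.cpp:

  #if (defined __GNUC__) && (!defined __MINGW32__) && (!defined __CYGWIN__)
  #define EIGEN_WEAK_LINKING __attribute__ ((weak))
  #else
  #define EIGEN_WEAK_LINKING
  #endif

  extern "C" {
  // Weak default: a strong definition provided by the user wins at link time.
  // Where weak symbols are unsupported (MinGW, and with this patch Cygwin),
  // this becomes an ordinary definition, and a duplicate user definition
  // would instead be a multiple-definition link error.
  EIGEN_WEAK_LINKING void example_blas_error_handler(const char* /*msg*/)
  {
    // default behaviour: ignore the error
  }
  }  // extern "C"
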
--- blas/xerbla.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp index 8775b88cd..c373e8699 100644 --- a/blas/xerbla.cpp +++ b/blas/xerbla.cpp @@ -1,7 +1,7 @@ #include -#if (defined __GNUC__) && (!defined __MINGW32__) +#if (defined __GNUC__) && (!defined __MINGW32__) && (!defined __CYGWIN__) #define EIGEN_WEAK_LINKING __attribute__ ((weak)) #else #define EIGEN_WEAK_LINKING From 682196e9fcdef7fb329fde833e30e91bb3f89077 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 16:41:00 +0100 Subject: [PATCH 06/83] Fixed MPRealSupport --- unsupported/Eigen/MPRealSupport | 22 ++++++++++++++++------ unsupported/test/mpreal/mpreal.h | 3 ++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport index 8e42965a3..89036886b 100644 --- a/unsupported/Eigen/MPRealSupport +++ b/unsupported/Eigen/MPRealSupport @@ -141,20 +141,32 @@ int main() public: typedef mpfr::mpreal ResScalar; enum { + Vectorizable = false, + LhsPacketSize = 1, + RhsPacketSize = 1, + ResPacketSize = 1, + NumberOfRegisters = 1, nr = 1, mr = 1, LhsProgress = 1, RhsProgress = 1 }; + typedef ResScalar LhsPacket; + typedef ResScalar RhsPacket; + typedef ResScalar ResPacket; + }; - template - struct gebp_kernel + + + template + struct gebp_kernel { typedef mpfr::mpreal mpreal; EIGEN_DONT_INLINE - void operator()(mpreal* res, Index resStride, const mpreal* blockA, const mpreal* blockB, Index rows, Index depth, Index cols, mpreal alpha, + void operator()(const DataMapper& res, const mpreal* blockA, const mpreal* blockB, + Index rows, Index depth, Index cols, const mpreal& alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0) { if(rows==0 || cols==0 || depth==0) @@ -170,8 +182,6 @@ int main() { for(Index j=0; j // Options -#define MPREAL_HAVE_INT64_SUPPORT // Enable int64_t support if possible. Available only for MSVC 2010 & GCC. +// FIXME HAVE_INT64_SUPPORT leads to clashes with long int and int64_t on some systems. +//#define MPREAL_HAVE_INT64_SUPPORT // Enable int64_t support if possible. Available only for MSVC 2010 & GCC. #define MPREAL_HAVE_MSVC_DEBUGVIEW // Enable Debugger Visualizer for "Debug" builds in MSVC. #define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS // Enable extended std::numeric_limits specialization. // Meaning that "digits", "round_style" and similar members are defined as functions, not constants. From 73dd95e7b03b4451ddf5cee17f4b44f2a7d6d91d Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 16:51:53 +0100 Subject: [PATCH 07/83] Use @CMAKE_MAKE_PROGRAM@ instead of make in buildtests.sh --- scripts/buildtests.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/buildtests.in b/scripts/buildtests.in index 7026373cf..d2fd10276 100755 --- a/scripts/buildtests.in +++ b/scripts/buildtests.in @@ -14,9 +14,9 @@ targets_to_make=`echo "$TESTSLIST" | egrep "$1" | xargs echo` if [ -n "${EIGEN_MAKE_ARGS:+x}" ] then - make $targets_to_make ${EIGEN_MAKE_ARGS} + @CMAKE_MAKE_PROGRAM@ $targets_to_make ${EIGEN_MAKE_ARGS} else - make $targets_to_make + @CMAKE_MAKE_PROGRAM@ $targets_to_make @EIGEN_TEST_BUILD_FLAGS@ fi exit $? 
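
Aside on the convention exercised earlier in this series: patch 02 re-enables the min()/max() macro traps in test/main.h, and its companion hunk in GeneralBlockPanelKernel.h writes (std::max)(k,(std::max)(m,n)) instead of std::max(k,std::max(m,n)). A short self-contained illustration of why the extra parentheses matter, assuming a platform-style function-like max macro (as windows.h defines without NOMINMAX):

  #include <algorithm>

  // Simulate a platform header (or test/main.h after patch 02) that defines a
  // function-like max macro.
  #define max(A,B) please_protect_your_max_with_parentheses

  int largest_dimension(int k, int m, int n)
  {
    // return std::max(k, std::max(m, n));  // the macro expands here -> compile error
    // Wrapping the name in parentheses means 'max' is not followed by '(',
    // so the function-like macro is not invoked and std::max is called:
    return (std::max)(k, (std::max)(m, n));
  }
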
From 31e2ffe82ce2851521db6f8280ff2e91a159d35e Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 28 Feb 2015 18:39:37 +0100 Subject: [PATCH 08/83] Replaced POSIX random() by internal::random --- unsupported/test/cxx11_tensor_comparisons.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp index 186f56ac3..b1ff8aecb 100644 --- a/unsupported/test/cxx11_tensor_comparisons.cpp +++ b/unsupported/test/cxx11_tensor_comparisons.cpp @@ -54,7 +54,7 @@ static void test_equality() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - if (random() < 0.5) { + if (internal::random()) { mat2(i,j,k) = mat1(i,j,k); } } From ef09ce4552daa4bee80e910f6a1d151e84240a2f Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Sat, 28 Feb 2015 21:30:31 -0500 Subject: [PATCH 09/83] Fix for TensorIO for Fixed sized Tensors. The following code snippet was failing to compile: TensorFixedSize > t_4x3; cout << 4x3; --- unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h index a9d0f6c39..bdc6ddb87 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -30,14 +30,14 @@ std::ostream& operator << (std::ostream& os, const TensorBase::type Scalar; typedef typename T::Index Index; typedef typename TensorEvaluator, DefaultDevice>::Dimensions Dimensions; - const Index total_size = internal::array_prod(tensor.dimensions()); + const Index total_size = tensor.dimensions().TotalSize(); // Print the tensor as a 1d vector or a 2d matrix. if (internal::array_size::value == 1) { Map > array(const_cast(tensor.data()), total_size); os << array; } else { - const Index first_dim = tensor.dimensions()[0]; + const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); static const int layout = TensorEvaluator, DefaultDevice>::Layout; Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); os << matrix; From 3109f0e74e9d4d4b16409cbd6de6dc297a6b9e05 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 2 Mar 2015 20:09:33 +0100 Subject: [PATCH 10/83] Add SSE vectorization of Quaternion::conjugate. 
Significant speed-up when combined with products like q1*q2.conjugate() --- Eigen/src/Geometry/Quaternion.h | 17 ++++++++-- Eigen/src/Geometry/arch/Geometry_SSE.h | 46 ++++++++++++++++++++------ 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index e1ad803bb..8c44df699 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -441,7 +441,7 @@ QuaternionBase::operator* (const QuaternionBase& other) c YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) return internal::quat_product::Scalar, - internal::traits::IsAligned && internal::traits::IsAligned>::run(*this, other); + (internal::traits::IsAligned && internal::traits::IsAligned)?Aligned:Unaligned>::run(*this, other); } /** \sa operator*(Quaternion) */ @@ -646,6 +646,16 @@ inline Quaternion::Scalar> QuaternionBase struct quat_conj +{ + static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& q){ + return Quaternion(q.w(),-q.x(),-q.y(),-q.z()); + } +}; +} + /** \returns the conjugate of the \c *this which is equal to the multiplicative inverse * if the quaternion is normalized. * The conjugate of a quaternion represents the opposite rotation. @@ -656,7 +666,10 @@ template inline Quaternion::Scalar> QuaternionBase::conjugate() const { - return Quaternion(this->w(),-this->x(),-this->y(),-this->z()); + return internal::quat_conj::Scalar, + internal::traits::IsAligned?Aligned:Unaligned>::run(*this); + } /** \returns the angle (in radian) between two rotations diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h index 3d8284f2d..e59c32c56 100644 --- a/Eigen/src/Geometry/arch/Geometry_SSE.h +++ b/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -20,23 +20,35 @@ struct quat_product { static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000)); Quaternion res; + const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); __m128 a = _a.coeffs().template packet(0); __m128 b = _b.coeffs().template packet(0); - __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,1,2,0,2), - vec4f_swizzle1(b,2,0,1,2)),mask); - __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,3,3,3,1), - vec4f_swizzle1(b,0,1,2,1)),mask); + __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); + __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); pstore(&res.x(), _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), vec4f_swizzle1(b,1,2,0,0))), - _mm_add_ps(flip1,flip2))); + _mm_xor_ps(mask,_mm_add_ps(s1,s2)))); + return res; } }; +template +struct quat_conj +{ + static inline Quaternion run(const QuaternionBase& q) + { + Quaternion res; + const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); + pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet(0))); + return res; + } +}; + + template struct cross3_impl { @@ -56,8 +68,8 @@ struct cross3_impl -template -struct quat_product +template +struct quat_product { static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { @@ -66,8 +78,8 @@ struct quat_product Quaternion res; const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); + Packet2d b_xy = _b.coeffs().template packet(0); + Packet2d b_zw = _b.coeffs().template packet(2); Packet2d 
a_xx = pset1(a[0]); Packet2d a_yy = pset1(a[1]); Packet2d a_zz = pset1(a[2]); @@ -108,6 +120,20 @@ struct quat_product } }; +template +struct quat_conj +{ + static inline Quaternion run(const QuaternionBase& q) + { + Quaternion res; + const __m128d mask0 = _mm_setr_pd(-0.,-0.); + const __m128d mask2 = _mm_setr_pd(-0.,0.); + pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet(0))); + pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet(2))); + return res; + } +}; + } // end namespace internal } // end namespace Eigen From 1ec0f4fadf1b8d95fb1506e87112d0c7888afd95 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 2 Mar 2015 16:08:54 -0500 Subject: [PATCH 11/83] HalfPacket also needed to be disabled for double, on ARMv8. --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 8dd1e1370..d0544bdf1 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -541,12 +541,12 @@ typedef float64x1_t Packet1d; template<> struct packet_traits : default_packet_traits { typedef Packet2d type; - typedef Packet1d half; + typedef Packet2d half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket=1, + HasHalfPacket=0, HasDiv = 1, // FIXME check the Has* From 9930e9583b336edda281a4490cfe69c53082318e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 2 Mar 2015 18:08:38 -0500 Subject: [PATCH 12/83] Improve analyze-blocking-sizes, and in particular give it a evaluate-defaults tool that shows the efficiency of Eigen's default blocking sizes choices, using a previously computed table from benchmark-blocking-sizes. --- bench/analyze-blocking-sizes.cpp | 232 +++++++++++++++++++++++++------ 1 file changed, 186 insertions(+), 46 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index a603c0216..f507eab38 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -7,6 +7,10 @@ #include #include #include +#include +#include + +#include using namespace std; @@ -88,6 +92,11 @@ struct preprocessed_inputfile_entry_t float efficiency; }; +bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2) +{ + return e1.efficiency < e2.efficiency; +} + struct preprocessed_inputfile_t { string filename; @@ -394,17 +403,187 @@ void print_partition( cout << endl; } +struct action_t +{ + virtual const char* invokation_name() const { abort(); return nullptr; } + virtual void run(const vector& preprocessed_inputfiles) const { abort(); } + virtual ~action_t() {} +}; + +struct partition_action_t : action_t +{ + virtual const char* invokation_name() const { return "partition"; } + virtual void run(const vector& preprocessed_inputfiles) const + { + check_all_files_in_same_exact_order(preprocessed_inputfiles); + + float required_efficiency_to_beat = 0.0f; + vector>> partitions; + cerr << "searching for partitions...\r" << flush; + while (true) + { + vector> partition; + find_partition_with_efficiency_higher_than( + preprocessed_inputfiles, + required_efficiency_to_beat, + partition); + float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition); + cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size() + << " subsets for " << 100.0f * actual_efficiency + << " % efficiency" + << " \r" << flush; + partitions.push_back(partition); + if 
(partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) { + break; + } + required_efficiency_to_beat = actual_efficiency; + } + cerr << " " << endl; + while (true) { + bool repeat = false; + for (size_t i = 0; i < partitions.size() - 1; i++) { + if (partitions[i].size() >= partitions[i+1].size()) { + partitions.erase(partitions.begin() + i); + repeat = true; + break; + } + } + if (!repeat) { + break; + } + } + for (auto it = partitions.begin(); it != partitions.end(); ++it) { + print_partition(preprocessed_inputfiles, *it); + } + } +}; + +uint8_t log2_pot(size_t x) { + size_t l = 0; + while (x >>= 1) l++; + return l; +} + +uint16_t compact_size_triple(size_t k, size_t m, size_t n) +{ + return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); +} + +// just a helper to store a triple of K,M,N sizes for matrix product +struct size_triple_t +{ + size_t k, m, n; + size_triple_t() : k(0), m(0), n(0) {} + size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} + size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} + size_triple_t(uint16_t compact) + { + k = 1 << ((compact & 0xf00) >> 8); + m = 1 << ((compact & 0x0f0) >> 4); + n = 1 << ((compact & 0x00f) >> 0); + } + bool is_cubic() const { return k == m && m == n; } +}; + +struct evaluate_defaults_action_t : action_t +{ + virtual const char* invokation_name() const { return "evaluate-defaults"; } + virtual void run(const vector& preprocessed_inputfiles) const + { + if (preprocessed_inputfiles.size() > 1) { + cerr << invokation_name() << " only works with one input file." << endl; + exit(1); + } + + const preprocessed_inputfile_t& preprocessed_inputfile = preprocessed_inputfiles.front(); + + uint16_t product_size = 0; + uint16_t default_block_size = 0; + vector results, cubic_results; + for (auto it = preprocessed_inputfile.entries.begin(); it != preprocessed_inputfile.entries.end(); ++it) { + if (it->product_size != product_size) { + product_size = it->product_size; + size_triple_t product_size_triple(product_size); + Eigen::Index k = product_size_triple.k, + m = product_size_triple.m, + n = product_size_triple.n; + Eigen::internal::computeProductBlockingSizes(k, m, n); + default_block_size = compact_size_triple(k, m, n); + } + if (it->block_size == default_block_size) { + results.push_back(*it); + if (size_triple_t(product_size).is_cubic()) { + cubic_results.push_back(*it); + } + } + } + + cerr << "Below are all results - first column is product size tripe, " << endl + << "second column is block size triple, in hex kmn form where" << endl + << "k, m, n are the log2 of the actual values, i.e. a89 means" << endl + << "k=1024, m=256, n=512." 
<< endl << endl; + for (auto it = results.begin(); it != results.end(); ++it) { + cerr << hex << it->product_size << " " << it->block_size + << " efficiency " << std::dec << 100.0f * it->efficiency << " %" << endl; + } + cerr << endl; + sort(results.begin(), results.end(), lower_efficiency); + sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); + cerr << "Efficiency summary: min = " + << 100.0f * results.front().efficiency << " %, max = " + << 100.0f * results.back().efficiency << " %, median = " + << 100.0f * results[results.size() / 2].efficiency << " %" << endl; + cerr << "20% of product sizes have efficiency <= " << 100.0f * results[results.size() * 20 / 100].efficiency << " %" << endl; + cerr << "10% of product sizes have efficiency <= " << 100.0f * results[results.size() * 10 / 100].efficiency << " %" << endl; + cerr << "5% of product sizes have efficiency <= " << 100.0f * results[results.size() * 5 / 100].efficiency << " %" << endl; + cerr << "Cubic sizes efficiency summary: min = " + << 100.0f * cubic_results.front().efficiency << " %, max = " + << 100.0f * cubic_results.back().efficiency << " %, median = " + << 100.0f * cubic_results[cubic_results.size() / 2].efficiency << " %" << endl; + + } +}; + + +void show_usage_and_exit(int argc, char* argv[], + const vector>& available_actions) +{ + cerr << "usage: " << argv[0] << " " << endl; + cerr << "available actions:" << endl; + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + cerr << " " << (*it)->invokation_name() << endl; + } + cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl; + exit(1); +} + int main(int argc, char* argv[]) { - if (argc == 1) { - cerr << "usage: " << argv[0] << " [input files]" << endl; - cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl; - exit(1); - } cout.precision(3); cerr.precision(3); + + vector> available_actions; + available_actions.emplace_back(new partition_action_t); + available_actions.emplace_back(new evaluate_defaults_action_t); + + auto action = available_actions.end(); + + if (argc <= 2) { + show_usage_and_exit(argc, argv, available_actions); + } + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[1], (*it)->invokation_name())) { + action = it; + break; + } + } + + if (action == available_actions.end()) { + show_usage_and_exit(argc, argv, available_actions); + } + vector inputfilenames; - for (int i = 1; i < argc; i++) { + for (int i = 2; i < argc; i++) { inputfilenames.emplace_back(argv[i]); } @@ -413,44 +592,5 @@ int main(int argc, char* argv[]) preprocessed_inputfiles.emplace_back(inputfile_t(*it)); } - check_all_files_in_same_exact_order(preprocessed_inputfiles); - - float required_efficiency_to_beat = 0.0f; - vector>> partitions; - cerr << "searching for partitions...\r" << flush; - while (true) - { - vector> partition; - find_partition_with_efficiency_higher_than( - preprocessed_inputfiles, - required_efficiency_to_beat, - partition); - float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition); - cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size() - << " subsets for " << 100.0f * actual_efficiency - << " % efficiency" - << " \r" << flush; - partitions.push_back(partition); - if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) { - break; - } - required_efficiency_to_beat = actual_efficiency; - } - cerr << " " << 
endl; - while (true) { - bool repeat = false; - for (size_t i = 0; i < partitions.size() - 1; i++) { - if (partitions[i].size() >= partitions[i+1].size()) { - partitions.erase(partitions.begin() + i); - repeat = true; - break; - } - } - if (!repeat) { - break; - } - } - for (auto it = partitions.begin(); it != partitions.end(); ++it) { - print_partition(preprocessed_inputfiles, *it); - } + (*action)->run(preprocessed_inputfiles); } From f8390995127f9f73f2376c43f93eaa27bbad3675 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 09:35:22 -0500 Subject: [PATCH 13/83] Work around an ICE in Clang 3.5 in the iOS toolchain with double NEON intrinsics. --- Eigen/src/Core/arch/NEON/PacketMath.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index d0544bdf1..ce0abfd80 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -518,7 +518,19 @@ ptranspose(PacketBlock& kernel) { } //---------- double ---------- -#if EIGEN_ARCH_ARM64 + +// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double. +// Confirmed at least with __apple_build_version__ = 6000054. +#ifdef __apple_build_version__ +// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed. +// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with +// major toolchain updates. +#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000) +#else +#define EIGEN_APPLE_DOUBLE_NEON_BUG 0 +#endif + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG #if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) // Bug 907: workaround missing declarations of the following two functions in the ADK From ccc1277a42f254b184b750292a7cd672a58bbc63 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 10:17:05 -0500 Subject: [PATCH 14/83] must also disable complex when disabling double vectorization --- Eigen/src/Core/arch/NEON/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 154daa7a7..c7fb12fe8 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -272,7 +272,7 @@ ptranspose(PacketBlock& kernel) { } //---------- double ---------- -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000); From 37a93c4263324011242941cff87d444e9c465422 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 3 Mar 2015 17:08:28 +0100 Subject: [PATCH 15/83] New scoring functor to select the pivot. This is can be useful for non-floating point scalars, where choosing the biggest element is generally not the best choice. 
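
Aside: the hunks below introduce two internal hooks, scalar_score_coeff_op (the "score" of a candidate pivot, its absolute value by default) and abs_knowing_score (which recovers |a| cheaply once the winning score is already known). A minimal, hypothetical helper mirroring how FullPivLU consumes them after this patch (the helper itself is a sketch, not part of the patch):

  #include <Eigen/Dense>

  // Rank the entries of a bottom-right corner block by their score, then
  // convert only the winner's score back into an absolute value.
  template <typename MatrixType>
  typename MatrixType::RealScalar best_pivot_abs(const MatrixType& corner)
  {
    typedef typename MatrixType::Scalar Scalar;
    typedef Eigen::internal::scalar_score_coeff_op<Scalar> Scoring;
    typedef typename Scoring::result_type Score;
    Eigen::Index row, col;
    Score score = corner.unaryExpr(Scoring()).maxCoeff(&row, &col);
    return Eigen::internal::abs_knowing_score<Scalar>()(corner(row, col), score);
  }
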
--- Eigen/src/Core/functors/UnaryFunctors.h | 28 ++++++++++++ Eigen/src/LU/FullPivLU.h | 11 +++-- Eigen/src/LU/PartialPivLU.h | 8 ++-- Eigen/src/QR/FullPivHouseholderQR.h | 10 ++-- doc/CustomizingEigen.dox | 61 +++++++++++++++++++++++++ 5 files changed, 107 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index ec42e6850..f32f0f113 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -55,6 +55,34 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the score of a scalar, to chose a pivot + * + * \sa class CwiseUnaryOp + */ +template struct scalar_score_coeff_op : scalar_abs_op +{ + typedef void Score_is_abs; +}; +template +struct functor_traits > : functor_traits > {}; + +/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor. */ +template struct abs_knowing_score +{ + EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) + typedef typename NumTraits::Real result_type; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); } +}; +template struct abs_knowing_score::Score_is_abs> +{ + EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) + typedef typename NumTraits::Real result_type; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; } +}; + /** \internal * \brief Template functor to compute the squared absolute value of a scalar * diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 49c8b183d..d1a260a37 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -459,14 +459,16 @@ FullPivLU& FullPivLU::compute(const MatrixType& matrix) // biggest coefficient in the remaining bottom-right corner (starting at row k, col k) Index row_of_biggest_in_corner, col_of_biggest_in_corner; - RealScalar biggest_in_corner; + typedef internal::scalar_score_coeff_op Scoring; + typedef typename Scoring::result_type Score; + Score biggest_in_corner; biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k) - .cwiseAbs() + .unaryExpr(Scoring()) .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner); row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner, col_of_biggest_in_corner += k; // need to add k to them. - if(biggest_in_corner==RealScalar(0)) + if(biggest_in_corner==Score(0)) { // before exiting, make sure to initialize the still uninitialized transpositions // in a sane state without destroying what we already have. @@ -479,7 +481,8 @@ FullPivLU& FullPivLU::compute(const MatrixType& matrix) break; } - if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner; + RealScalar abs_pivot = internal::abs_knowing_score()(m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner); + if(abs_pivot > m_maxpivot) m_maxpivot = abs_pivot; // Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). 
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index e57b36bc5..3d8825a97 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -275,6 +275,8 @@ struct partial_lu_impl */ static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) { + typedef scalar_score_coeff_op Scoring; + typedef typename Scoring::result_type Score; const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows,cols); @@ -286,13 +288,13 @@ struct partial_lu_impl Index rcols = cols-k-1; Index row_of_biggest_in_col; - RealScalar biggest_in_corner - = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col); + Score biggest_in_corner + = lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col); row_of_biggest_in_col += k; row_transpositions[k] = PivIndex(row_of_biggest_in_col); - if(biggest_in_corner != RealScalar(0)) + if(biggest_in_corner != Score(0)) { if(k != row_of_biggest_in_col) { diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index 7d5e58d2f..4952fbb46 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -443,13 +443,15 @@ FullPivHouseholderQR& FullPivHouseholderQR::compute(cons for (Index k = 0; k < size; ++k) { Index row_of_biggest_in_corner, col_of_biggest_in_corner; - RealScalar biggest_in_corner; + typedef internal::scalar_score_coeff_op Scoring; + typedef typename Scoring::result_type Score; - biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k) - .cwiseAbs() - .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner); + Score score = m_qr.bottomRightCorner(rows-k, cols-k) + .unaryExpr(Scoring()) + .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner); row_of_biggest_in_corner += k; col_of_biggest_in_corner += k; + RealScalar biggest_in_corner = internal::abs_knowing_score()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score); if(k==0) biggest = biggest_in_corner; // if the corner is negligible, then we have less than full rank, and we can finish early diff --git a/doc/CustomizingEigen.dox b/doc/CustomizingEigen.dox index 0850863aa..cb25f4ec9 100644 --- a/doc/CustomizingEigen.dox +++ b/doc/CustomizingEigen.dox @@ -157,6 +157,67 @@ inline adouble abs2(const adouble& x) { return x*x; } #endif // ADOLCSUPPORT_H \endcode +This other example adds support for the \c mpq_class type from GMP. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types. 
+ +\code +#include +#include +#include + +namespace Eigen { + template struct NumTraits; + template<> struct NumTraits + { + typedef mpq_class Real; + typedef mpq_class NonInteger; + typedef mpq_class Nested; + + static inline Real epsilon() { return 0; } + static inline Real dummy_precision() { return 0; } + + enum { + IsInteger = 0, + IsSigned = 1, + IsComplex = 0, + RequireInitialization = 1, + ReadCost = 6, + AddCost = 150, + MulCost = 100 + }; + }; + + namespace internal { + template<> + struct significant_decimals_impl + { + // Infinite precision when printing + static inline int run() { return 0; } + }; + + template<> struct scalar_score_coeff_op { + struct result_type : boost::totally_ordered1 { + std::size_t len; + result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score() + result_type(mpq_class const& q) : + len(mpz_size(q.get_num_mpz_t())+ + mpz_size(q.get_den_mpz_t())-1) {} + friend bool operator<(result_type x, result_type y) { + // 0 is the worst possible pivot + if (x.len == 0) return y.len > 0; + if (y.len == 0) return false; + // Prefer a pivot with a small representation + return x.len > y.len; + } + friend bool operator==(result_type x, result_type y) { + // Only used to test if the score is 0 + return x.len == y.len; + } + }; + result_type operator()(mpq_class const& x) const { return x; } + }; + } +} +\endcode \sa \ref TopicPreprocessorDirectives From eae8e27b7d92c31fe07d18f0b6d0feded67f7096 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 11:41:21 -0500 Subject: [PATCH 16/83] Add a benchmark-default-sizes action to benchmark-blocking-sizes.cpp --- .../Core/products/GeneralBlockPanelKernel.h | 28 +- bench/analyze-blocking-sizes.cpp | 4 +- bench/benchmark-blocking-sizes.cpp | 249 ++++++++++++------ 3 files changed, 191 insertions(+), 90 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 0fc7289be..fe4ead384 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -86,19 +86,21 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads typedef gebp_traits Traits; #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES - EIGEN_UNUSED_VARIABLE(num_threads); - enum { - kr = 8, - mr = Traits::mr, - nr = Traits::nr - }; - k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); - if (k > kr) k -= k % kr; - m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); - if (m > mr) m -= m % mr; - n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); - if (n > nr) n -= n % nr; - return; + if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { + EIGEN_UNUSED_VARIABLE(num_threads); + enum { + kr = 8, + mr = Traits::mr, + nr = Traits::nr + }; + k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + if (k > kr) k -= k % kr; + m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + if (m > mr) m -= m % mr; + n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + if (n > nr) n -= n % nr; + return; + } #endif // Explanations: diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index f507eab38..d25d72d2f 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -559,8 +559,8 @@ void show_usage_and_exit(int argc, char* argv[], int main(int argc, char* argv[]) { - cout.precision(3); - cerr.precision(3); + cout.precision(4); + cerr.precision(4); vector> available_actions; available_actions.emplace_back(new partition_action_t); diff --git 
a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 04244575a..6d688c0ac 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -3,9 +3,11 @@ #include #include #include +#include +bool eigen_use_specific_block_size; int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n; -#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES +#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n @@ -82,16 +84,25 @@ struct benchmark_t { uint16_t compact_product_size; uint16_t compact_block_size; + bool use_default_block_size; float gflops; benchmark_t() : compact_product_size(0) , compact_block_size(0) , gflops(0) + , use_default_block_size(false) {} benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(compact_size_triple(bk, bm, bn)) + , use_default_block_size(false) + , gflops(0) + {} + benchmark_t(size_t pk, size_t pm, size_t pn) + : compact_product_size(compact_size_triple(pk, pm, pn)) + , compact_block_size(0) + , use_default_block_size(true) , gflops(0) {} @@ -100,10 +111,12 @@ struct benchmark_t ostream& operator<<(ostream& s, const benchmark_t& b) { - s << hex; - s << b.compact_product_size - << " " << b.compact_block_size; - s << dec; + s << hex << b.compact_product_size << dec; + if (b.use_default_block_size) { + s << " default"; + } else { + s << " " << hex << b.compact_block_size << dec; + } s << " " << b.gflops; return s; } @@ -121,14 +134,18 @@ bool operator<(const benchmark_t& b1, const benchmark_t& b2) void benchmark_t::run() { - // expand our compact benchmark params into proper triples size_triple_t productsizes(compact_product_size); - size_triple_t blocksizes(compact_block_size); - - // feed eigen with our custom blocking params - eigen_block_size_k = blocksizes.k; - eigen_block_size_m = blocksizes.m; - eigen_block_size_n = blocksizes.n; + + if (use_default_block_size) { + eigen_use_specific_block_size = false; + } else { + // feed eigen with our custom blocking params + eigen_use_specific_block_size = true; + size_triple_t blocksizes(compact_block_size); + eigen_block_size_k = blocksizes.k; + eigen_block_size_m = blocksizes.m; + eigen_block_size_n = blocksizes.n; + } // set up the matrix pool @@ -231,9 +248,23 @@ string type_name() return "double"; } -void show_usage_and_exit(const char *progname) +struct action_t { - cerr << "usage: " << progname << " [--min-working-set-size=N]" << endl << endl; + virtual const char* invokation_name() const { abort(); return nullptr; } + virtual void run() const { abort(); } + virtual ~action_t() {} +}; + +void show_usage_and_exit(int argc, char* argv[], + const vector>& available_actions) +{ + cerr << "usage: " << argv[0] << " [options...]" << endl << endl; + cerr << "available actions:" << endl << endl; + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + cerr << " " << (*it)->invokation_name() << endl; + } + cerr << endl; + cerr << "options:" << endl << endl; cerr << " --min-working-set-size=N:" << endl; cerr << " Set the minimum working set size to N bytes." << endl; cerr << " This is rounded up as needed to a multiple of matrix size." 
<< endl; @@ -245,56 +276,8 @@ void show_usage_and_exit(const char *progname) exit(1); } -int main(int argc, char* argv[]) +void run_benchmarks(vector& benchmarks) { - for (int i = 1; i < argc; i++) { - if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { - const char* equals_sign = strchr(argv[i], '='); - min_working_set_size = strtoul(equals_sign+1, nullptr, 10); - } else { - cerr << "unrecognized option: " << argv[i] << endl << endl; - show_usage_and_exit(argv[0]); - } - } - - cout.precision(4); - - print_cpuinfo(); - - cout << "benchmark parameters:" << endl; - cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; - cout << "scalar type: " << type_name() << endl; - cout << "packet size: " << internal::packet_traits::size << endl; - cout << "minsize = " << minsize << endl; - cout << "maxsize = " << maxsize << endl; - cout << "measurement_repetitions = " << measurement_repetitions << endl; - cout << "min_accurate_time = " << min_accurate_time << endl; - cout << "min_working_set_size = " << min_working_set_size; - if (min_working_set_size == 0) { - cout << " (try to outsize caches)"; - } - cout << endl << endl; - - - // assemble the array of benchmarks without running them at first - vector benchmarks; - for (int repetition = 0; repetition < measurement_repetitions; repetition++) { - for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { - for (size_t msize = minsize; msize <= maxsize; msize *= 2) { - for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { - for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) { - for (size_t mblock = minsize; mblock <= msize; mblock *= 2) { - for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) { - benchmark_t b(ksize, msize, nsize, kblock, mblock, nblock); - benchmarks.push_back(b); - } - } - } - } - } - } - } - // randomly shuffling benchmarks allows us to get accurate enough progress info, // as now the cheap/expensive benchmarks are randomly mixed so they average out. random_shuffle(benchmarks.begin(), benchmarks.end()); @@ -315,14 +298,23 @@ int main(int argc, char* argv[]) if (i > 10) { cerr << ", ETA "; - float eta = float(time_now - time_start) * (1.0f - ratio_done) / ratio_done; - if (eta > 3600) - cerr << eta/3600 << " hours"; - else if (eta > 60) - cerr << eta/60 << " minutes"; - else cerr << eta << " seconds"; + int eta = int(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done); + int eta_remainder = eta; + if (eta_remainder > 3600) { + int hours = eta_remainder / 3600; + cerr << hours << " h "; + eta_remainder -= hours * 3600; + } + if (eta_remainder > 60) { + int minutes = eta_remainder / 60; + cerr << minutes << " min "; + eta_remainder -= minutes * 60; + } + if (eta < 600 && eta_remainder) { + cerr << eta_remainder << " s"; + } } - cerr << " \r" << flush; + cerr << " \r" << flush; } // This is where we actually run a benchmark! @@ -348,9 +340,116 @@ int main(int argc, char* argv[]) } } - // Output data. 
- cout << "BEGIN MEASUREMENTS" << endl; - for (auto it = best_benchmarks.begin(); it != best_benchmarks.end(); ++it) { - cout << *it << endl; - } + // keep and return only the best benchmarks + benchmarks = best_benchmarks; +} + +struct measure_all_pot_sizes_action_t : action_t +{ + virtual const char* invokation_name() const { return "measure-all-pot-sizes"; } + virtual void run() const + { + vector benchmarks; + for (int repetition = 0; repetition < measurement_repetitions; repetition++) { + for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { + for (size_t msize = minsize; msize <= maxsize; msize *= 2) { + for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { + for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) { + for (size_t mblock = minsize; mblock <= msize; mblock *= 2) { + for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) { + benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock); + } + } + } + } + } + } + } + + run_benchmarks(benchmarks); + + cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + cout << *it << endl; + } + } +}; + +struct measure_default_sizes_action_t : action_t +{ + virtual const char* invokation_name() const { return "measure-default-sizes"; } + virtual void run() const + { + vector benchmarks; + for (int repetition = 0; repetition < measurement_repetitions; repetition++) { + for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { + for (size_t msize = minsize; msize <= maxsize; msize *= 2) { + for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { + benchmarks.emplace_back(ksize, msize, nsize); + } + } + } + } + + run_benchmarks(benchmarks); + + cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl; + for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { + cout << *it << endl; + } + } +}; + +int main(int argc, char* argv[]) +{ + cout.precision(4); + cerr.precision(4); + + vector> available_actions; + available_actions.emplace_back(new measure_all_pot_sizes_action_t); + available_actions.emplace_back(new measure_default_sizes_action_t); + + auto action = available_actions.end(); + + if (argc <= 1) { + show_usage_and_exit(argc, argv, available_actions); + } + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[1], (*it)->invokation_name())) { + action = it; + break; + } + } + + if (action == available_actions.end()) { + show_usage_and_exit(argc, argv, available_actions); + } + + for (int i = 2; i < argc; i++) { + if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { + const char* equals_sign = strchr(argv[i], '='); + min_working_set_size = strtoul(equals_sign+1, nullptr, 10); + } else { + cerr << "unrecognized option: " << argv[i] << endl << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + + print_cpuinfo(); + + cout << "benchmark parameters:" << endl; + cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; + cout << "scalar type: " << type_name() << endl; + cout << "packet size: " << internal::packet_traits::size << endl; + cout << "minsize = " << minsize << endl; + cout << "maxsize = " << maxsize << endl; + cout << "measurement_repetitions = " << measurement_repetitions << endl; + cout << "min_accurate_time = " << min_accurate_time << endl; + cout << "min_working_set_size = " << min_working_set_size; + if (min_working_set_size == 0) { + cout << " (try to outsize caches)"; + } + cout << endl << endl; + + (*action)->run(); } From 
f64b4480afba333dca8ed628bd73f9a1cddf3dd1 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 11:43:56 -0500 Subject: [PATCH 17/83] Add missing copyright notices --- bench/analyze-blocking-sizes.cpp | 9 +++++++++ bench/benchmark-blocking-sizes.cpp | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index d25d72d2f..3316f8cbf 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -1,3 +1,12 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + #include #include #include diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 6d688c0ac..f1fef1ede 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -1,3 +1,12 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + #include #include #include From 5d2fd64a1a060a388e6681c3556bb4dc9482943e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 3 Mar 2015 08:56:49 -0800 Subject: [PATCH 18/83] Fixed compilation error when compiling with gcc4.7 --- unsupported/test/cxx11_tensor_expr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 695565e9b..8389e9840 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -260,7 +260,7 @@ static void test_type_casting() mat1.setRandom(); mat2.setRandom(); - mat3 = mat1.template cast(); + mat3 = mat1.cast(); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { @@ -269,7 +269,7 @@ static void test_type_casting() } } - mat3 = mat2.template cast(); + mat3 = mat2.cast(); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { From 2aa09e6b4ecd28b871c0e48cc7d6dd800bf5de47 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Mar 2015 13:44:00 -0500 Subject: [PATCH 19/83] Fix asm comments in 1px1 kernel --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index fe4ead384..df9f66995 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1481,17 +1481,17 @@ void gebp_kernel Date: Wed, 4 Mar 2015 09:34:27 +0100 Subject: [PATCH 20/83] Add a CG-based solver for rectangular least-square problems (bug #975). 
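
Background for the new solver (a sketch of the underlying math, not text from the patch): for a rectangular A, the least-square CG iteration amounts to running preconditioned CG on the normal equations, and the LeastSquareDiagonalPreconditioner added below approximates the diagonal of A'A by the squared column norms, storing their inverses:

  A^* A \, x = A^* b, \qquad (A^* A)_{jj} = \|A_{:,j}\|^2
  \quad\Longrightarrow\quad (M^{-1})_{jj} = 1 / \|A_{:,j}\|^2
  \ \ (\text{or } 1 \text{ when column } j \text{ is identically zero}).
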
--- Eigen/IterativeLinearSolvers | 10 +- .../BasicPreconditioners.h | 70 +++++- .../ConjugateGradient.h | 44 ++-- .../LeastSquareConjugateGradient.h | 213 ++++++++++++++++++ test/CMakeLists.txt | 1 + test/lscg.cpp | 29 +++ test/sparse_solver.h | 58 ++++- 7 files changed, 392 insertions(+), 33 deletions(-) create mode 100644 Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h create mode 100644 test/lscg.cpp diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers index c06668bd2..0594feb41 100644 --- a/Eigen/IterativeLinearSolvers +++ b/Eigen/IterativeLinearSolvers @@ -12,24 +12,26 @@ * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse. * Those solvers are accessible via the following classes: * - ConjugateGradient for selfadjoint (hermitian) matrices, + * - LSCG for rectangular least-square problems, * - BiCGSTAB for general square matrices. * * These iterative solvers are associated with some preconditioners: * - IdentityPreconditioner - not really useful * - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices. - * - IncompleteILUT - incomplete LU factorization with dual thresholding + * - IncompleteLUT - incomplete LU factorization with dual thresholding * * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport. * - * \code - * #include - * \endcode + \code + #include + \endcode */ #include "src/IterativeLinearSolvers/SolveWithGuess.h" #include "src/IterativeLinearSolvers/IterativeSolverBase.h" #include "src/IterativeLinearSolvers/BasicPreconditioners.h" #include "src/IterativeLinearSolvers/ConjugateGradient.h" +#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h" #include "src/IterativeLinearSolvers/BiCGSTAB.h" #include "src/IterativeLinearSolvers/IncompleteLUT.h" diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index a09f81225..6da423cf6 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -17,9 +17,9 @@ namespace Eigen { * * This class allows to approximately solve for A.x = b problems assuming A is a diagonal matrix. * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for: - * \code - * A.diagonal().asDiagonal() . x = b - * \endcode + \code + A.diagonal().asDiagonal() . x = b + \endcode * * \tparam _Scalar the type of the scalar. * @@ -28,6 +28,7 @@ namespace Eigen { * * \note A variant that has yet to be implemented would attempt to preserve the norm of each column. * + * \sa class LeastSquareDiagonalPreconditioner, class ConjugateGradient */ template class DiagonalPreconditioner @@ -100,6 +101,69 @@ class DiagonalPreconditioner bool m_isInitialized; }; +/** \ingroup IterativeLinearSolvers_Module + * \brief Jacobi preconditioner for LSCG + * + * This class allows to approximately solve for A' A x = A' b problems assuming A' A is a diagonal matrix. + * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for: + \code + (A.adjoint() * A).diagonal().asDiagonal() * x = b + \endcode + * + * \tparam _Scalar the type of the scalar. + * + * The diagonal entries are pre-inverted and stored into a dense vector. 
+ * + * \sa class LSCG, class DiagonalPreconditioner + */ +template +class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> +{ + typedef _Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef DiagonalPreconditioner<_Scalar> Base; + using Base::m_invdiag; + public: + + LeastSquareDiagonalPreconditioner() : Base() {} + + template + explicit LeastSquareDiagonalPreconditioner(const MatType& mat) : Base() + { + compute(mat); + } + + template + LeastSquareDiagonalPreconditioner& analyzePattern(const MatType& ) + { + return *this; + } + + template + LeastSquareDiagonalPreconditioner& factorize(const MatType& mat) + { + // Compute the inverse squared-norm of each column of mat + m_invdiag.resize(mat.cols()); + for(Index j=0; j0) + m_invdiag(j) = RealScalar(1)/sum; + else + m_invdiag(j) = RealScalar(1); + } + Base::m_isInitialized = true; + return *this; + } + + template + LeastSquareDiagonalPreconditioner& compute(const MatType& mat) + { + return factorize(mat); + } + + protected: +}; /** \ingroup IterativeLinearSolvers_Module * \brief A naive preconditioner which approximates any matrix as the identity matrix diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index a799c3ef5..10cd94783 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -60,29 +60,29 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, } VectorType p(n); - p = precond.solve(residual); //initial search direction + p = precond.solve(residual); // initial search direction VectorType z(n), tmp(n); RealScalar absNew = numext::real(residual.dot(p)); // the square of the absolute value of r scaled by invM Index i = 0; while(i < maxIters) { - tmp.noalias() = mat * p; // the bottleneck of the algorithm + tmp.noalias() = mat * p; // the bottleneck of the algorithm - Scalar alpha = absNew / p.dot(tmp); // the amount we travel on dir - x += alpha * p; // update solution - residual -= alpha * tmp; // update residue + Scalar alpha = absNew / p.dot(tmp); // the amount we travel on dir + x += alpha * p; // update solution + residual -= alpha * tmp; // update residual residualNorm2 = residual.squaredNorm(); if(residualNorm2 < threshold) break; - z = precond.solve(residual); // approximately solve for "A z = residual" + z = precond.solve(residual); // approximately solve for "A z = residual" RealScalar absOld = absNew; absNew = numext::real(residual.dot(z)); // update the absolute value of r - RealScalar beta = absNew / absOld; // calculate the Gram-Schmidt value used to create the new search direction - p = z + beta * p; // update search direction + RealScalar beta = absNew / absOld; // calculate the Gram-Schmidt value used to create the new search direction + p = z + beta * p; // update search direction i++; } tol_error = sqrt(residualNorm2 / rhsNorm2); @@ -122,24 +122,24 @@ struct traits > * and NumTraits::epsilon() for the tolerance. * * This class can be used as the direct solver classes. 
Here is a typical usage example: - * \code - * int n = 10000; - * VectorXd x(n), b(n); - * SparseMatrix A(n,n); - * // fill A and b - * ConjugateGradient > cg; - * cg.compute(A); - * x = cg.solve(b); - * std::cout << "#iterations: " << cg.iterations() << std::endl; - * std::cout << "estimated error: " << cg.error() << std::endl; - * // update b, and solve again - * x = cg.solve(b); - * \endcode + \code + int n = 10000; + VectorXd x(n), b(n); + SparseMatrix A(n,n); + // fill A and b + ConjugateGradient > cg; + cg.compute(A); + x = cg.solve(b); + std::cout << "#iterations: " << cg.iterations() << std::endl; + std::cout << "estimated error: " << cg.error() << std::endl; + // update b, and solve again + x = cg.solve(b); + \endcode * * By default the iterations start with x=0 as an initial guess of the solution. * One can control the start using the solveWithGuess() method. * - * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner + * \sa class LSCG, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, int _UpLo, typename _Preconditioner> class ConjugateGradient : public IterativeSolverBase > diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h new file mode 100644 index 000000000..beaf5c307 --- /dev/null +++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -0,0 +1,213 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H +#define EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H + +namespace Eigen { + +namespace internal { + +/** \internal Low-level conjugate gradient algorithm for least-square problems + * \param mat The matrix A + * \param rhs The right hand side vector b + * \param x On input and initial solution, on output the computed solution. + * \param precond A preconditioner being able to efficiently solve for an + * approximation of A'Ax=b (regardless of b) + * \param iters On input the max number of iteration, on output the number of performed iterations. + * \param tol_error On input the tolerance error, on output an estimation of the relative error. 
+ */ +template +EIGEN_DONT_INLINE +void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, + const Preconditioner& precond, Index& iters, + typename Dest::RealScalar& tol_error) +{ + using std::sqrt; + using std::abs; + typedef typename Dest::RealScalar RealScalar; + typedef typename Dest::Scalar Scalar; + typedef Matrix VectorType; + + RealScalar tol = tol_error; + Index maxIters = iters; + + Index m = mat.rows(), n = mat.cols(); + + VectorType residual = rhs - mat * x; + VectorType normal_residual = mat.adjoint() * residual; + + RealScalar rhsNorm2 = (mat.adjoint()*rhs).squaredNorm(); + if(rhsNorm2 == 0) + { + x.setZero(); + iters = 0; + tol_error = 0; + return; + } + RealScalar threshold = tol*tol*rhsNorm2; + RealScalar residualNorm2 = normal_residual.squaredNorm(); + if (residualNorm2 < threshold) + { + iters = 0; + tol_error = sqrt(residualNorm2 / rhsNorm2); + return; + } + + VectorType p(n); + p = precond.solve(normal_residual); // initial search direction + + VectorType z(n), tmp(m); + RealScalar absNew = numext::real(normal_residual.dot(p)); // the square of the absolute value of r scaled by invM + Index i = 0; + while(i < maxIters) + { + tmp.noalias() = mat * p; + + Scalar alpha = absNew / tmp.squaredNorm(); // the amount we travel on dir + x += alpha * p; // update solution + residual -= alpha * tmp; // update residual + normal_residual = mat.adjoint() * residual; // update residual of the normal equation + + residualNorm2 = normal_residual.squaredNorm(); + if(residualNorm2 < threshold) + break; + + z = precond.solve(normal_residual); // approximately solve for "A'A z = normal_residual" + + RealScalar absOld = absNew; + absNew = numext::real(normal_residual.dot(z)); // update the absolute value of r + RealScalar beta = absNew / absOld; // calculate the Gram-Schmidt value used to create the new search direction + p = z + beta * p; // update search direction + i++; + } + tol_error = sqrt(residualNorm2 / rhsNorm2); + iters = i; +} + +} + +template< typename _MatrixType, + typename _Preconditioner = LeastSquareDiagonalPreconditioner > +class LSCG; + +namespace internal { + +template< typename _MatrixType, typename _Preconditioner> +struct traits > +{ + typedef _MatrixType MatrixType; + typedef _Preconditioner Preconditioner; +}; + +} + +/** \ingroup IterativeLinearSolvers_Module + * \brief A conjugate gradient solver for sparse (or dense) least-square problems + * + * This class allows to solve for A x = b linear problems using an iterative conjugate gradient algorithm. + * The matrix A can be non symmetric and rectangular, but the matrix A' A should be positive-definite to guaranty stability. + * Otherwise, the SparseLU or SparseQR classes might be preferable. + * The matrix A and the vectors x and b can be either dense or sparse. + * + * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. + * \tparam _Preconditioner the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner + * + * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() + * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations + * and NumTraits::epsilon() for the tolerance. + * + * This class can be used as the direct solver classes. 
Here is a typical usage example: + \code + int m=1000000, n = 10000; + VectorXd x(n), b(m); + SparseMatrix A(m,n); + // fill A and b + LSCG > lscg; + lscg.compute(A); + x = lscg.solve(b); + std::cout << "#iterations: " << lscg.iterations() << std::endl; + std::cout << "estimated error: " << lscg.error() << std::endl; + // update b, and solve again + x = lscg.solve(b); + \endcode + * + * By default the iterations start with x=0 as an initial guess of the solution. + * One can control the start using the solveWithGuess() method. + * + * \sa class ConjugateGradient, SparseLU, SparseQR + */ +template< typename _MatrixType, typename _Preconditioner> +class LSCG : public IterativeSolverBase > +{ + typedef IterativeSolverBase Base; + using Base::mp_matrix; + using Base::m_error; + using Base::m_iterations; + using Base::m_info; + using Base::m_isInitialized; +public: + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef _Preconditioner Preconditioner; + +public: + + /** Default constructor. */ + LSCG() : Base() {} + + /** Initialize the solver with matrix \a A for further \c Ax=b solving. + * + * This constructor is a shortcut for the default constructor followed + * by a call to compute(). + * + * \warning this class stores a reference to the matrix A as well as some + * precomputed values that depend on it. Therefore, if \a A is changed + * this class becomes invalid. Call compute() to update it with the new + * matrix A, or modify a copy of A. + */ + explicit LSCG(const MatrixType& A) : Base(A) {} + + ~LSCG() {} + + /** \internal */ + template + void _solve_with_guess_impl(const Rhs& b, Dest& x) const + { + m_iterations = Base::maxIterations(); + m_error = Base::m_tolerance; + + for(Index j=0; j + void _solve_impl(const MatrixBase& b, Dest& x) const + { + x.setZero(); + _solve_with_guess_impl(b.derived(),x); + } + +}; + +} // end namespace Eigen + +#endif // EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 168749634..1712b8718 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -234,6 +234,7 @@ ei_add_test(sparse_permutations) ei_add_test(simplicial_cholesky) ei_add_test(conjugate_gradient) ei_add_test(bicgstab) +ei_add_test(lscg) ei_add_test(sparselu) ei_add_test(sparseqr) ei_add_test(umeyama) diff --git a/test/lscg.cpp b/test/lscg.cpp new file mode 100644 index 000000000..599ed5619 --- /dev/null +++ b/test/lscg.cpp @@ -0,0 +1,29 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "sparse_solver.h" +#include + +template void test_lscg_T() +{ + LSCG > lscg_colmajor_diag; + LSCG, IdentityPreconditioner> lscg_colmajor_I; + + CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag) ); + CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I) ); + + CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_diag) ); + CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_I) ); +} + +void test_lscg() +{ + CALL_SUBTEST_1(test_lscg_T()); + CALL_SUBTEST_2(test_lscg_T >()); +} diff --git a/test/sparse_solver.h b/test/sparse_solver.h index f0a4691e3..f266e2c9a 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -17,9 +17,9 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, typedef typename Mat::Scalar Scalar; typedef typename Mat::StorageIndex StorageIndex; - DenseRhs refX = dA.lu().solve(db); + DenseRhs refX = dA.householderQr().solve(db); { - Rhs x(b.rows(), b.cols()); + Rhs x(A.cols(), b.cols()); Rhs oldb = b; solver.compute(A); @@ -94,7 +94,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, // test dense Block as the result and rhs: { - DenseRhs x(db.rows(), db.cols()); + DenseRhs x(refX.rows(), refX.cols()); DenseRhs oldb(db); x.setZero(); x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols())); @@ -119,7 +119,7 @@ void check_sparse_solving_real_cases(Solver& solver, const typename Solver::Matr typedef typename Mat::Scalar Scalar; typedef typename Mat::RealScalar RealScalar; - Rhs x(b.rows(), b.cols()); + Rhs x(A.cols(), b.cols()); solver.compute(A); if (solver.info() != Success) @@ -410,3 +410,53 @@ template void check_sparse_square_abs_determinant(Solver& solve } } +template +void generate_sparse_leastsquare_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag) +{ + typedef typename Solver::MatrixType Mat; + typedef typename Mat::Scalar Scalar; + + int rows = internal::random(1,maxSize); + int cols = internal::random(1,rows); + double density = (std::max)(8./(rows*cols), 0.01); + + A.resize(rows,cols); + dA.resize(rows,cols); + + initSparse(density, dA, A, options); +} + +template void check_sparse_leastsquare_solving(Solver& solver) +{ + typedef typename Solver::MatrixType Mat; + typedef typename Mat::Scalar Scalar; + typedef SparseMatrix SpMat; + typedef Matrix DenseMatrix; + typedef Matrix DenseVector; + + int rhsCols = internal::random(1,16); + + Mat A; + DenseMatrix dA; + for (int i = 0; i < g_repeat; i++) { + generate_sparse_leastsquare_problem(solver, A, dA); + + A.makeCompressed(); + DenseVector b = DenseVector::Random(A.rows()); + DenseMatrix dB(A.rows(),rhsCols); + SpMat B(A.rows(),rhsCols); + double density = (std::max)(8./(A.rows()*rhsCols), 0.1); + initSparse(density, dB, B, ForceNonZeroDiag); + B.makeCompressed(); + check_sparse_solving(solver, A, b, dA, b); + check_sparse_solving(solver, A, dB, dA, dB); + check_sparse_solving(solver, A, B, dA, dB); + + // check only once + if(i==0) + { + b = DenseVector::Zero(A.rows()); + check_sparse_solving(solver, A, b, dA, b); + } + } +} From 3dca4a1efc66d5fce6a6aada7cd7cc248ca8ae40 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Mar 2015 09:35:30 +0100 Subject: [PATCH 21/83] Update manual wrt new LSCG solver. 
--- Eigen/Sparse | 6 +++--- doc/SparseLinearSystems.dox | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Eigen/Sparse b/Eigen/Sparse index 7cc9c0913..a540f0eec 100644 --- a/Eigen/Sparse +++ b/Eigen/Sparse @@ -11,9 +11,9 @@ * - \ref SparseQR_Module * - \ref IterativeLinearSolvers_Module * - * \code - * #include - * \endcode + \code + #include + \endcode */ #include "SparseCore" diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 147b55376..13741280a 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -21,6 +21,9 @@ They are summarized in the following table: ConjugateGradient\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkClassic iterative CGSPDPreconditionning built-in, MPL2 Recommended for large symmetric problems (e.g., 3D Poisson eq.) +LSCG\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkCG for rectangular least-square problemRectangularPreconditionning + built-in, MPL2 + Solve for min |A'Ax-b|^2 without forming A'A BiCGSTAB\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlinkIterative stabilized bi-conjugate gradientSquarePreconditionning built-in, MPL2 To speedup the convergence, try it with the \ref IncompleteLUT preconditioner. From 1ce017836389e5ddef3b0a93cfcbc410db06506f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Mar 2015 09:39:26 +0100 Subject: [PATCH 22/83] Improve efficiency of SparseMatrix::insert/coeffRef for sequential outer-index insertion strategies (bug #974) --- Eigen/src/SparseCore/SparseMatrix.h | 132 ++++++++++++++++++++++++---- 1 file changed, 117 insertions(+), 15 deletions(-) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 4562f3df9..0ba7e111a 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -222,24 +222,18 @@ class SparseMatrix * The non zero coefficient must \b not already exist. * * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed - * mode while reserving room for 2 non zeros per inner vector. It is strongly recommended to first - * call reserve(const SizesType &) to reserve a more appropriate number of elements per - * inner vector that better match your scenario. + * mode while reserving room for 2 x this->innerSize() non zeros if reserve(Index) has not been called earlier. + * In this case, the insertion procedure is optimized for a \e sequential insertion mode where elements are assumed to be + * inserted by increasing outer-indices. + * + * If that's not the case, then it is strongly recommended to either use a triplet-list to assemble the matrix, or to first + * call reserve(const SizesType &) to reserve the appropriate number of non-zero elements per inner vector. * - * This function performs a sorted insertion in O(1) if the elements of each inner vector are - * inserted in increasing inner index order, and in O(nnz_j) for a random insertion. + * Assuming memory has been appropriately reserved, this function performs a sorted insertion in O(1) + * if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random insertion. 
* */ - Scalar& insert(Index row, Index col) - { - eigen_assert(row>=0 && row=0 && col& SparseMatrix +typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insert(Index row, Index col) +{ + eigen_assert(row>=0 && row=0 && col(std::malloc(m_outerSize * sizeof(StorageIndex))); + if(!m_innerNonZeros) internal::throw_std_bad_alloc(); + + memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex)); + + // pack all inner-vectors to the end of the pre-allocated space + // and allocate the entire free-space to the first inner-vector + StorageIndex end = convert_index(m_data.allocatedSize()); + for(Index j=1; j<=m_outerSize; ++j) + m_outerIndex[j] = end; + } + } + + // check whether we can do a fast "push back" insertion + Index data_end = m_data.allocatedSize(); + + // First case: we are filling a new inner vector which is packed at the end. + // We assume that all remaining inner-vectors are also empty and packed to the end. + if(m_outerIndex[outer]==data_end) + { + eigen_internal_assert(m_innerNonZeros[outer]==0); + + // pack previous empty inner-vectors to end of the used-space + // and allocate the entire free-space to the current inner-vector. + StorageIndex p = convert_index(m_data.size()); + Index j = outer; + while(j>=0 && m_innerNonZeros[j]==0) + m_outerIndex[j--] = p; + + // push back the new element + ++m_innerNonZeros[outer]; + m_data.append(Scalar(0), inner); + + // check for reallocation + if(data_end != m_data.allocatedSize()) + { + // m_data has been reallocated + // -> move remaining inner-vectors back to the end of the free-space + // so that the entire free-space is allocated to the current inner-vector. + eigen_internal_assert(data_end < m_data.allocatedSize()); + StorageIndex new_end = convert_index(m_data.allocatedSize()); + for(Index j=outer+1; j<=m_outerSize; ++j) + if(m_outerIndex[j]==data_end) + m_outerIndex[j] = new_end; + } + return m_data.value(p); + } + + // Second case: the next inner-vector is packed to the end + // and the current inner-vector end match the used-space. + if(m_outerIndex[outer+1]==data_end && m_outerIndex[outer]+m_innerNonZeros[outer]==m_data.size()) + { + eigen_internal_assert(outer+1==m_outerSize || m_innerNonZeros[outer+1]==0); + + // add space for the new element + ++m_innerNonZeros[outer]; + m_data.resize(m_data.size()+1); + + // check for reallocation + if(data_end != m_data.allocatedSize()) + { + // m_data has been reallocated + // -> move remaining inner-vectors back to the end of the free-space + // so that the entire free-space is allocated to the current inner-vector. 
+ eigen_internal_assert(data_end < m_data.allocatedSize()); + StorageIndex new_end = convert_index(m_data.allocatedSize()); + for(Index j=outer+1; j<=m_outerSize; ++j) + if(m_outerIndex[j]==data_end) + m_outerIndex[j] = new_end; + } + + // and insert it at the right position (sorted insertion) + Index startId = m_outerIndex[outer]; + Index p = m_outerIndex[outer]+m_innerNonZeros[outer]-1; + while ( (p > startId) && (m_data.index(p-1) > inner) ) + { + m_data.index(p) = m_data.index(p-1); + m_data.value(p) = m_data.value(p-1); + --p; + } + + m_data.index(p) = convert_index(inner); + return (m_data.value(p) = 0); + } + + // make sure the matrix is compatible to random un-compressed insertion: + m_data.resize(m_data.allocatedSize()); + + return insertUncompressed(row,col); +} + template EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col) { From c43154bbc5cd686b52a67b495875337001b54c49 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Mar 2015 10:16:46 +0100 Subject: [PATCH 23/83] Check for no-reallocation in SparseMatrix::insert (bug #974) --- Eigen/src/SparseCore/CompressedStorage.h | 3 +++ test/sparse_basic.cpp | 31 +++++++++++++++++++++--- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index cae9c38b0..49fd46658 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -214,6 +214,9 @@ class CompressedStorage inline void reallocate(Index size) { + #ifdef EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN + EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN + #endif eigen_internal_assert(size!=m_allocatedSize); internal::scoped_array newValues(size); internal::scoped_array newIndices(size); diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 8021f4db6..e243964f4 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -9,6 +9,9 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+static long g_realloc_count = 0; +#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++; + #include "sparse.h" template void sparse_basic(const SparseMatrixType& ref) @@ -107,17 +110,31 @@ template void sparse_basic(const SparseMatrixType& re DenseMatrix m1(rows,cols); m1.setZero(); SparseMatrixType m2(rows,cols); - if(internal::random()%2) - m2.reserve(VectorXi::Constant(m2.outerSize(), 2)); + bool call_reserve = internal::random()%2; + Index nnz = internal::random(1,int(rows)/2); + if(call_reserve) + { + if(internal::random()%2) + m2.reserve(VectorXi::Constant(m2.outerSize(), int(nnz))); + else + m2.reserve(m2.outerSize() * nnz); + } + g_realloc_count = 0; for (Index j=0; j(0,rows-1); if (m1.coeff(i,j)==Scalar(0)) m2.insert(i,j) = m1(i,j) = internal::random(); } } + + if(call_reserve && !SparseMatrixType::IsRowMajor) + { + VERIFY(g_realloc_count==0); + } + m2.finalize(); VERIFY_IS_APPROX(m2,m1); } @@ -575,7 +592,7 @@ void big_sparse_triplet(Index rows, Index cols, double density) { void test_sparse_basic() { for(int i = 0; i < g_repeat; i++) { - int r = Eigen::internal::random(1,100), c = Eigen::internal::random(1,100); + int r = Eigen::internal::random(1,200), c = Eigen::internal::random(1,200); if(Eigen::internal::random(0,4) == 0) { r = c; // check square matrices in 25% of tries } @@ -588,6 +605,12 @@ void test_sparse_basic() CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); + r = Eigen::internal::random(1,100); + c = Eigen::internal::random(1,100); + if(Eigen::internal::random(0,4) == 0) { + r = c; // check square matrices in 25% of tries + } + CALL_SUBTEST_1(( sparse_basic(SparseMatrix(short(r), short(c))) )); CALL_SUBTEST_1(( sparse_basic(SparseMatrix(short(r), short(c))) )); } From 168ceb271e631c17da1c722119746ef5931fdca5 Mon Sep 17 00:00:00 2001 From: Jan Blechta Date: Wed, 18 Feb 2015 14:26:10 +0100 Subject: [PATCH 24/83] Really use zero guess in ConjugateGradients::solve as documented and expected for consistency with other methods. --- Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 10cd94783..fab5fdb1f 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -208,7 +208,7 @@ public: template void _solve_impl(const MatrixBase& b, Dest& x) const { - x.setOnes(); + x.setZero(); _solve_with_guess_impl(b.derived(),x); } From 00ea12188167411da9f83bbebedbc8822143eaf0 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 4 Mar 2015 09:30:56 -0500 Subject: [PATCH 25/83] Complete the tool to analyze the efficiency of default sizes. 
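
The analyzer now distinguishes two kinds of measurement files ("all pot
sizes" vs "default sizes") and, for each product size, reports the
efficiency of the default blocking sizes as default_gflops divided by the
best GFLOP/s over all power-of-two block sizes. Throughout the tool a
(k, m, n) triple of power-of-two sizes is encoded as three hex digits
holding log2 of each dimension; a small standalone sketch of that encoding
(helper bodies copied from the tool, the main() is illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static uint8_t log2_pot(size_t x) { uint8_t l = 0; while (x >>= 1) l++; return l; }

    static uint16_t compact_size_triple(size_t k, size_t m, size_t n)
    {
      return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
    }

    int main()
    {
      // prints "a89": k = 2^10 = 1024, m = 2^8 = 256, n = 2^9 = 512
      std::printf("%x\n", compact_size_triple(1024, 256, 512));
    }
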
--- bench/analyze-blocking-sizes.cpp | 384 ++++++++++++++++++++--------- bench/benchmark-blocking-sizes.cpp | 9 +- 2 files changed, 274 insertions(+), 119 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index 3316f8cbf..5c26582cc 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -23,20 +23,63 @@ using namespace std; +const int default_precision = 4; + +uint8_t log2_pot(size_t x) { + size_t l = 0; + while (x >>= 1) l++; + return l; +} + +uint16_t compact_size_triple(size_t k, size_t m, size_t n) +{ + return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); +} + +// just a helper to store a triple of K,M,N sizes for matrix product +struct size_triple_t +{ + uint16_t k, m, n; + size_triple_t() : k(0), m(0), n(0) {} + size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} + size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} + size_triple_t(uint16_t compact) + { + k = 1 << ((compact & 0xf00) >> 8); + m = 1 << ((compact & 0x0f0) >> 4); + n = 1 << ((compact & 0x00f) >> 0); + } + bool is_cubic() const { return k == m && m == n; } +}; + +ostream& operator<<(ostream& s, const size_triple_t& t) +{ + return s << "(" << t.k << ", " << t.m << ", " << t.n << ")"; +} + struct inputfile_entry_t { uint16_t product_size; - uint16_t block_size; + uint16_t pot_block_size; + size_triple_t nonpot_block_size; float gflops; }; struct inputfile_t { + enum class type_t { + unknown, + all_pot_sizes, + default_sizes + }; + string filename; vector entries; + type_t type; inputfile_t(const string& fname) : filename(fname) + , type(type_t::unknown) { ifstream stream(filename); if (!stream.is_open()) { @@ -44,52 +87,96 @@ struct inputfile_t exit(1); } string line; - bool is_in_measurements = false; while (getline(stream, line)) { if (line.empty()) continue; - if (line.find("BEGIN MEASUREMENTS") == 0) { - is_in_measurements = true; + if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) { + if (type != type_t::unknown) { + cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines"; + exit(1); + } + type = type_t::all_pot_sizes; continue; } - - if (!is_in_measurements) { + if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) { + if (type != type_t::unknown) { + cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines"; + exit(1); + } + type = type_t::default_sizes; continue; } + - unsigned int product_size, block_size; - float gflops; - int sscanf_result = - sscanf(line.c_str(), "%x %x %f", - &product_size, - &block_size, - &gflops); - if (3 != sscanf_result || - !product_size || - product_size > 0xfff || - !block_size || - block_size > 0xfff || - !isfinite(gflops)) - { - cerr << "ill-formed input file: " << filename << endl; - cerr << "offending line:" << endl << line << endl; - exit(1); + if (type == type_t::unknown) { + continue; + } + switch(type) { + case type_t::all_pot_sizes: { + unsigned int product_size, block_size; + float gflops; + int sscanf_result = + sscanf(line.c_str(), "%x %x %f", + &product_size, + &block_size, + &gflops); + if (3 != sscanf_result || + !product_size || + product_size > 0xfff || + !block_size || + block_size > 0xfff || + !isfinite(gflops)) + { + cerr << "ill-formed input file: " << filename << endl; + cerr << "offending line:" << endl << line << endl; + exit(1); + } + inputfile_entry_t entry; + entry.product_size = uint16_t(product_size); + entry.pot_block_size = uint16_t(block_size); + entry.gflops = gflops; + 
entries.push_back(entry); + break; + } + case type_t::default_sizes: { + unsigned int product_size; + float gflops; + int bk, bm, bn; + int sscanf_result = + sscanf(line.c_str(), "%x default(%d, %d, %d) %f", + &product_size, + &bk, &bm, &bn, + &gflops); + if (5 != sscanf_result || + !product_size || + product_size > 0xfff || + !isfinite(gflops)) + { + cerr << "ill-formed input file: " << filename << endl; + cerr << "offending line:" << endl << line << endl; + exit(1); + } + inputfile_entry_t entry; + entry.product_size = uint16_t(product_size); + entry.pot_block_size = 0; + entry.nonpot_block_size = size_triple_t(bk, bm, bn); + entry.gflops = gflops; + entries.push_back(entry); + break; + } + + default: + break; } - inputfile_entry_t entry; - entry.product_size = uint16_t(product_size); - entry.block_size = uint16_t(block_size); - entry.gflops = gflops; - entries.push_back(entry); } stream.close(); - if (!is_in_measurements) { - cerr << "Input file " << filename << " didn't contain a BEGIN MEASUREMENTS line. Wrong file?" << endl; + if (type == type_t::unknown) { + cerr << "Unrecognized input file " << filename << endl; exit(1); } if (entries.empty()) { cerr << "didn't find any measurements in input file: " << filename << endl; exit(1); } - //cerr << "read " << entries.size() << " measurements from " << filename << endl; } }; @@ -114,6 +201,9 @@ struct preprocessed_inputfile_t preprocessed_inputfile_t(const inputfile_t& inputfile) : filename(inputfile.filename) { + if (inputfile.type != inputfile_t::type_t::all_pot_sizes) { + abort(); + } auto it = inputfile.entries.begin(); auto it_first_with_given_product_size = it; while (it != inputfile.entries.end()) { @@ -145,7 +235,7 @@ private: for (auto it = begin; it != end; ++it) { preprocessed_inputfile_entry_t entry; entry.product_size = it->product_size; - entry.block_size = it->block_size; + entry.block_size = it->pot_block_size; entry.efficiency = it->gflops / max_gflops; entries.push_back(entry); } @@ -415,15 +505,44 @@ void print_partition( struct action_t { virtual const char* invokation_name() const { abort(); return nullptr; } - virtual void run(const vector& preprocessed_inputfiles) const { abort(); } + virtual void run(int, char*[]) const { abort(); } virtual ~action_t() {} }; struct partition_action_t : action_t { virtual const char* invokation_name() const { return "partition"; } - virtual void run(const vector& preprocessed_inputfiles) const + virtual void run(int argc, char *argv[]) const { + vector preprocessed_inputfiles; + + if (!argc) { + cerr << "The " << invokation_name() << " action needs a list of input files." << endl; + exit(1); + } + + vector inputfilenames; + for (int i = 0; i < argc; i++) { + inputfilenames.emplace_back(argv[i]); + } + + for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) { + inputfile_t inputfile(*it); + switch (inputfile.type) { + case inputfile_t::type_t::all_pot_sizes: + preprocessed_inputfiles.emplace_back(inputfile); + break; + case inputfile_t::type_t::default_sizes: + cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and " + << "has no use for " << *it << " which contains measurements for default sizes." 
<< endl; + exit(1); + break; + default: + cerr << "Unrecognized input file: " << *it << endl; + exit(1); + } + } + check_all_files_in_same_exact_order(preprocessed_inputfiles); float required_efficiency_to_beat = 0.0f; @@ -467,89 +586,132 @@ struct partition_action_t : action_t } }; -uint8_t log2_pot(size_t x) { - size_t l = 0; - while (x >>= 1) l++; - return l; -} - -uint16_t compact_size_triple(size_t k, size_t m, size_t n) -{ - return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); -} - -// just a helper to store a triple of K,M,N sizes for matrix product -struct size_triple_t -{ - size_t k, m, n; - size_triple_t() : k(0), m(0), n(0) {} - size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} - size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} - size_triple_t(uint16_t compact) - { - k = 1 << ((compact & 0xf00) >> 8); - m = 1 << ((compact & 0x0f0) >> 4); - n = 1 << ((compact & 0x00f) >> 0); - } - bool is_cubic() const { return k == m && m == n; } -}; - struct evaluate_defaults_action_t : action_t { - virtual const char* invokation_name() const { return "evaluate-defaults"; } - virtual void run(const vector& preprocessed_inputfiles) const + struct results_entry_t { + uint16_t product_size; + size_triple_t default_block_size; + uint16_t best_pot_block_size; + float default_gflops; + float best_pot_gflops; + float default_efficiency; + }; + friend ostream& operator<<(ostream& s, const results_entry_t& entry) { - if (preprocessed_inputfiles.size() > 1) { - cerr << invokation_name() << " only works with one input file." << endl; - exit(1); + return s + << "Product size " << size_triple_t(entry.product_size) + << ": default block size " << entry.default_block_size + << " -> " << entry.default_gflops + << " GFlop/s = " << entry.default_efficiency * 100.0f << " %" + << " of best POT block size " << size_triple_t(entry.best_pot_block_size) + << " -> " << entry.best_pot_gflops + << " GFlop/s" << dec; + } + static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) { + return e1.default_efficiency < e2.default_efficiency; + } + virtual const char* invokation_name() const { return "evaluate-defaults"; } + void show_usage_and_exit() const + { + cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl; + cerr << "checks how well the performance with default sizes compares to the best " + << "performance measured over all POT sizes." << endl; + exit(1); + } + virtual void run(int argc, char *argv[]) const + { + if (argc != 2) { + show_usage_and_exit(); } - - const preprocessed_inputfile_t& preprocessed_inputfile = preprocessed_inputfiles.front(); - + inputfile_t inputfile_default_sizes(argv[0]); + inputfile_t inputfile_all_pot_sizes(argv[1]); + if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) { + cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl; + show_usage_and_exit(); + } + if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) { + cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." 
<< endl; + show_usage_and_exit(); + } + vector results; + vector cubic_results; + uint16_t product_size = 0; - uint16_t default_block_size = 0; - vector results, cubic_results; - for (auto it = preprocessed_inputfile.entries.begin(); it != preprocessed_inputfile.entries.end(); ++it) { - if (it->product_size != product_size) { - product_size = it->product_size; - size_triple_t product_size_triple(product_size); - Eigen::Index k = product_size_triple.k, - m = product_size_triple.m, - n = product_size_triple.n; - Eigen::internal::computeProductBlockingSizes(k, m, n); - default_block_size = compact_size_triple(k, m, n); + auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin(); + for (auto it_default_sizes = inputfile_default_sizes.entries.begin(); + it_default_sizes != inputfile_default_sizes.entries.end(); + ++it_default_sizes) + { + if (it_default_sizes->product_size == product_size) { + continue; } - if (it->block_size == default_block_size) { - results.push_back(*it); - if (size_triple_t(product_size).is_cubic()) { - cubic_results.push_back(*it); + product_size = it_default_sizes->product_size; + while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() && + it_all_pot_sizes->product_size != product_size) + { + ++it_all_pot_sizes; + } + if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) { + break; + } + uint16_t best_pot_block_size = 0; + float best_pot_gflops = 0; + for (auto it = it_all_pot_sizes; + it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size; + ++it) + { + if (it->gflops > best_pot_gflops) { + best_pot_gflops = it->gflops; + best_pot_block_size = it->pot_block_size; } } + results_entry_t entry; + entry.product_size = product_size; + entry.default_block_size = it_default_sizes->nonpot_block_size; + entry.best_pot_block_size = best_pot_block_size; + entry.default_gflops = it_default_sizes->gflops; + entry.best_pot_gflops = best_pot_gflops; + entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops; + results.push_back(entry); + + size_triple_t t(product_size); + if (t.k == t.m && t.m == t.n) { + cubic_results.push_back(entry); + } } - cerr << "Below are all results - first column is product size tripe, " << endl - << "second column is block size triple, in hex kmn form where" << endl - << "k, m, n are the log2 of the actual values, i.e. a89 means" << endl - << "k=1024, m=256, n=512." 
<< endl << endl; + cerr << "All results:" << endl; for (auto it = results.begin(); it != results.end(); ++it) { - cerr << hex << it->product_size << " " << it->block_size - << " efficiency " << std::dec << 100.0f * it->efficiency << " %" << endl; + cerr << *it << endl; } cerr << endl; + sort(results.begin(), results.end(), lower_efficiency); - sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); - cerr << "Efficiency summary: min = " - << 100.0f * results.front().efficiency << " %, max = " - << 100.0f * results.back().efficiency << " %, median = " - << 100.0f * results[results.size() / 2].efficiency << " %" << endl; - cerr << "20% of product sizes have efficiency <= " << 100.0f * results[results.size() * 20 / 100].efficiency << " %" << endl; - cerr << "10% of product sizes have efficiency <= " << 100.0f * results[results.size() * 10 / 100].efficiency << " %" << endl; - cerr << "5% of product sizes have efficiency <= " << 100.0f * results[results.size() * 5 / 100].efficiency << " %" << endl; - cerr << "Cubic sizes efficiency summary: min = " - << 100.0f * cubic_results.front().efficiency << " %, max = " - << 100.0f * cubic_results.back().efficiency << " %, median = " - << 100.0f * cubic_results[cubic_results.size() / 2].efficiency << " %" << endl; + const size_t n = min(20, results.size()); + cerr << n << " worst results:" << endl; + for (size_t i = 0; i < n; i++) { + cerr << results[i] << endl; + } + cerr << endl; + + cerr << "cubic results:" << endl; + for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) { + cerr << *it << endl; + } + cerr << endl; + + sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); + + cerr.precision(2); + vector a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f}; + for (auto it = a.begin(); it != a.end(); ++it) { + size_t n = min(results.size() - 1, size_t(*it * results.size())); + cerr << (100.0f * n / (results.size() - 1)) + << " % of product sizes have default efficiency <= " + << 100.0f * results[n].default_efficiency << " %" << endl; + } + cerr.precision(default_precision); } }; @@ -568,8 +730,8 @@ void show_usage_and_exit(int argc, char* argv[], int main(int argc, char* argv[]) { - cout.precision(4); - cerr.precision(4); + cout.precision(default_precision); + cerr.precision(default_precision); vector> available_actions; available_actions.emplace_back(new partition_action_t); @@ -577,7 +739,7 @@ int main(int argc, char* argv[]) auto action = available_actions.end(); - if (argc <= 2) { + if (argc < 2) { show_usage_and_exit(argc, argv, available_actions); } for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { @@ -591,15 +753,5 @@ int main(int argc, char* argv[]) show_usage_and_exit(argc, argv, available_actions); } - vector inputfilenames; - for (int i = 2; i < argc; i++) { - inputfilenames.emplace_back(argv[i]); - } - - vector preprocessed_inputfiles; - for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) { - preprocessed_inputfiles.emplace_back(inputfile_t(*it)); - } - - (*action)->run(preprocessed_inputfiles); + (*action)->run(argc - 2, argv + 2); } diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index f1fef1ede..6925cf560 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -122,7 +122,10 @@ ostream& operator<<(ostream& s, const benchmark_t& b) { s << hex << b.compact_product_size << dec; if (b.use_default_block_size) { - s << " default"; + size_triple_t t(b.compact_product_size); + Index k = 
t.k, m = t.m, n = t.n; + internal::computeProductBlockingSizes(k, m, n); + s << " default(" << k << ", " << m << ", " << n << ")"; } else { s << " " << hex << b.compact_block_size << dec; } @@ -355,7 +358,7 @@ void run_benchmarks(vector& benchmarks) struct measure_all_pot_sizes_action_t : action_t { - virtual const char* invokation_name() const { return "measure-all-pot-sizes"; } + virtual const char* invokation_name() const { return "all-pot-sizes"; } virtual void run() const { vector benchmarks; @@ -386,7 +389,7 @@ struct measure_all_pot_sizes_action_t : action_t struct measure_default_sizes_action_t : action_t { - virtual const char* invokation_name() const { return "measure-default-sizes"; } + virtual const char* invokation_name() const { return "default-sizes"; } virtual void run() const { vector benchmarks; From 2231b3dece9260ee0ba84e9080f11d2015ed5d90 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 4 Mar 2015 09:45:12 -0500 Subject: [PATCH 26/83] output to cout, not cerr, the actual results --- bench/analyze-blocking-sizes.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index 5c26582cc..7aae57d93 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -680,38 +680,38 @@ struct evaluate_defaults_action_t : action_t } } - cerr << "All results:" << endl; + cout << "All results:" << endl; for (auto it = results.begin(); it != results.end(); ++it) { - cerr << *it << endl; + cout << *it << endl; } - cerr << endl; + cout << endl; sort(results.begin(), results.end(), lower_efficiency); const size_t n = min(20, results.size()); - cerr << n << " worst results:" << endl; + cout << n << " worst results:" << endl; for (size_t i = 0; i < n; i++) { - cerr << results[i] << endl; + cout << results[i] << endl; } - cerr << endl; + cout << endl; - cerr << "cubic results:" << endl; + cout << "cubic results:" << endl; for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) { - cerr << *it << endl; + cout << *it << endl; } - cerr << endl; + cout << endl; sort(cubic_results.begin(), cubic_results.end(), lower_efficiency); - cerr.precision(2); + cout.precision(2); vector a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f}; for (auto it = a.begin(); it != a.end(); ++it) { size_t n = min(results.size() - 1, size_t(*it * results.size())); - cerr << (100.0f * n / (results.size() - 1)) + cout << (100.0f * n / (results.size() - 1)) << " % of product sizes have default efficiency <= " << 100.0f * results[n].default_efficiency << " %" << endl; } - cerr.precision(default_precision); + cout.precision(default_precision); } }; From 2dc968e453e347966b7a45c9c497c6b1d3845f80 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Mar 2015 17:03:13 +0100 Subject: [PATCH 27/83] bug #824: improve accuracy of Quaternion::angularDistance using atan2 instead of acos. 
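
For two unit quaternions that differ by a small rotation, the absolute dot
product rounds to 1 in floating point, so 2*acos(|<q1,q2>|) collapses to
exactly 0. Writing d = q1 * q2.conjugate(), we have |d.vec()| =
|sin(theta/2)| and |d.w()| = |cos(theta/2)|, and atan2 of that pair is well
conditioned for all angles. A small standalone check of the two formulas
(the epsilon value is illustrative only):

    #include <Eigen/Geometry>
    #include <algorithm>
    #include <cmath>
    #include <iostream>

    int main()
    {
      using namespace Eigen;
      // two rotations differing by a tiny angle about the z axis
      double eps = 1e-8;
      Quaterniond q1 = Quaterniond::Identity();
      Quaterniond q2(AngleAxisd(eps, Vector3d::UnitZ()));

      // old formula: the dot product rounds to 1, so the result is 0
      double old_way = 2.0 * std::acos(std::min(1.0, std::abs(q1.dot(q2))));

      // new formula: 2 * atan2(|vec(d)|, |w(d)|) with d = q1 * q2.conjugate()
      Quaterniond d = q1 * q2.conjugate();
      double new_way = 2.0 * std::atan2(d.vec().norm(), std::abs(d.w()));

      std::cout << old_way << " vs " << new_way
                << " (expected " << eps << ")" << std::endl;
    }
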
--- Eigen/src/Geometry/Quaternion.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index 8c44df699..e90ce77eb 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -680,12 +680,10 @@ template inline typename internal::traits::Scalar QuaternionBase::angularDistance(const QuaternionBase& other) const { - using std::acos; + using std::atan2; using std::abs; - Scalar d = abs(this->dot(other)); - if (d>=Scalar(1)) - return Scalar(0); - return Scalar(2) * acos(d); + Quaternion d = (*this) * other.conjugate(); + return Scalar(2) * atan2( d.vec().norm(), abs(d.w()) ); } From 755010702856250c5a63c9bcde305ebe1f5e61fa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 5 Mar 2015 10:03:46 +0100 Subject: [PATCH 28/83] Product optimization: implement a dynamic loop-swapping startegy to improve memory accesses to the destination matrix in the case of K-rank-update like products, i.e., for products of the kind: "large x small" * "small x large" --- .../Core/products/GeneralBlockPanelKernel.h | 57 +++++++++++++++---- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index df9f66995..8f4ee4dbb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -952,14 +952,28 @@ void gebp_kernel=3*Traits::LhsProgress) { +#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION + const bool swap_loops = EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION; +#else + const bool swap_loops = depth<48; +#endif + + Index bound1 = swap_loops ? packet_cols4 : peeled_mc3; + Index bound2 = !swap_loops ? packet_cols4 : peeled_mc3; + Index incr1 = swap_loops ? nr : 3*Traits::LhsProgress; + Index incr2 = !swap_loops ? nr : 3*Traits::LhsProgress; + PossiblyRotatingKernelHelper possiblyRotatingKernelHelper(traits); - + // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) - for(Index i=0; i=2*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (2*LhsProgress x depth) - for(Index i=peeled_mc3; i Date: Thu, 5 Mar 2015 10:16:32 +0100 Subject: [PATCH 29/83] Rename LSCG to LeastSquaresConjugateGradient --- Eigen/IterativeLinearSolvers | 2 +- .../BasicPreconditioners.h | 4 ++-- .../IterativeLinearSolvers/ConjugateGradient.h | 2 +- .../LeastSquareConjugateGradient.h | 16 ++++++++-------- test/lscg.cpp | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers index 0594feb41..7fab9eed0 100644 --- a/Eigen/IterativeLinearSolvers +++ b/Eigen/IterativeLinearSolvers @@ -12,7 +12,7 @@ * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse. * Those solvers are accessible via the following classes: * - ConjugateGradient for selfadjoint (hermitian) matrices, - * - LSCG for rectangular least-square problems, + * - LeastSquaresConjugateGradient for rectangular least-square problems, * - BiCGSTAB for general square matrices. 
* * These iterative solvers are associated with some preconditioners: diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 6da423cf6..3710a8209 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -102,7 +102,7 @@ class DiagonalPreconditioner }; /** \ingroup IterativeLinearSolvers_Module - * \brief Jacobi preconditioner for LSCG + * \brief Jacobi preconditioner for LeastSquaresConjugateGradient * * This class allows to approximately solve for A' A x = A' b problems assuming A' A is a diagonal matrix. * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for: @@ -114,7 +114,7 @@ class DiagonalPreconditioner * * The diagonal entries are pre-inverted and stored into a dense vector. * - * \sa class LSCG, class DiagonalPreconditioner + * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner */ template class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index fab5fdb1f..11b8347f7 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -139,7 +139,7 @@ struct traits > * By default the iterations start with x=0 as an initial guess of the solution. * One can control the start using the solveWithGuess() method. * - * \sa class LSCG, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner + * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, int _UpLo, typename _Preconditioner> class ConjugateGradient : public IterativeSolverBase > diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h index beaf5c307..1d819927e 100644 --- a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -95,12 +95,12 @@ void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest template< typename _MatrixType, typename _Preconditioner = LeastSquareDiagonalPreconditioner > -class LSCG; +class LeastSquaresConjugateGradient; namespace internal { template< typename _MatrixType, typename _Preconditioner> -struct traits > +struct traits > { typedef _MatrixType MatrixType; typedef _Preconditioner Preconditioner; @@ -129,7 +129,7 @@ struct traits > VectorXd x(n), b(m); SparseMatrix A(m,n); // fill A and b - LSCG > lscg; + LeastSquaresConjugateGradient > lscg; lscg.compute(A); x = lscg.solve(b); std::cout << "#iterations: " << lscg.iterations() << std::endl; @@ -144,9 +144,9 @@ struct traits > * \sa class ConjugateGradient, SparseLU, SparseQR */ template< typename _MatrixType, typename _Preconditioner> -class LSCG : public IterativeSolverBase > +class LeastSquaresConjugateGradient : public IterativeSolverBase > { - typedef IterativeSolverBase Base; + typedef IterativeSolverBase Base; using Base::mp_matrix; using Base::m_error; using Base::m_iterations; @@ -161,7 +161,7 @@ public: public: /** Default constructor. */ - LSCG() : Base() {} + LeastSquaresConjugateGradient() : Base() {} /** Initialize the solver with matrix \a A for further \c Ax=b solving. 
* @@ -173,9 +173,9 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit LSCG(const MatrixType& A) : Base(A) {} + explicit LeastSquaresConjugateGradient(const MatrixType& A) : Base(A) {} - ~LSCG() {} + ~LeastSquaresConjugateGradient() {} /** \internal */ template diff --git a/test/lscg.cpp b/test/lscg.cpp index 599ed5619..daa62a954 100644 --- a/test/lscg.cpp +++ b/test/lscg.cpp @@ -12,8 +12,8 @@ template void test_lscg_T() { - LSCG > lscg_colmajor_diag; - LSCG, IdentityPreconditioner> lscg_colmajor_I; + LeastSquaresConjugateGradient > lscg_colmajor_diag; + LeastSquaresConjugateGradient, IdentityPreconditioner> lscg_colmajor_I; CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag) ); CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I) ); From 5db2baa573965ad8d059038336e272aaab2d025b Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 5 Mar 2015 13:44:20 -0500 Subject: [PATCH 30/83] Make benchmark-blocking-sizes detect changes to clock speed and be resilient to that. --- bench/benchmark-blocking-sizes.cpp | 262 +++++++++++++++++++++++------ 1 file changed, 209 insertions(+), 53 deletions(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 6925cf560..0bf9c07f8 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -37,16 +37,18 @@ const int measurement_repetitions = 3; // Timings below this value are too short to be accurate, // we'll repeat measurements with more iterations until // we get a timing above that threshold. -const float min_accurate_time = 1e-2f; +const float g_min_accurate_time = 1e-2f; // See --min-working-set-size command line parameter. -size_t min_working_set_size = 0; +size_t g_min_working_set_size = 0; // range of sizes that we will benchmark (in all 3 K,M,N dimensions) const size_t maxsize = 2048; const size_t minsize = 16; typedef MatrixXf MatrixType; +typedef MatrixType::Scalar Scalar; +typedef internal::packet_traits::type Packet; static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two"); static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two"); @@ -95,24 +97,24 @@ struct benchmark_t uint16_t compact_block_size; bool use_default_block_size; float gflops; - benchmark_t() - : compact_product_size(0) - , compact_block_size(0) - , gflops(0) - , use_default_block_size(false) - {} + size_t min_working_set_size; + float min_accurate_time; benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(compact_size_triple(bk, bm, bn)) , use_default_block_size(false) , gflops(0) + , min_working_set_size(g_min_working_set_size) + , min_accurate_time(g_min_accurate_time) {} benchmark_t(size_t pk, size_t pm, size_t pn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(0) , use_default_block_size(true) , gflops(0) + , min_working_set_size(g_min_working_set_size) + , min_accurate_time(g_min_accurate_time) {} void run(); @@ -124,7 +126,7 @@ ostream& operator<<(ostream& s, const benchmark_t& b) if (b.use_default_block_size) { size_triple_t t(b.compact_product_size); Index k = t.k, m = t.m, n = t.n; - internal::computeProductBlockingSizes(k, m, n); + internal::computeProductBlockingSizes(k, m, n); s << " default(" << k << ", " << m << ", " << n << ")"; } else { s << " " << hex << b.compact_block_size << dec; @@ -162,7 +164,7 @@ void benchmark_t::run() // 
set up the matrix pool const size_t combined_three_matrices_sizes = - sizeof(MatrixType::Scalar) * + sizeof(Scalar) * (productsizes.k * productsizes.m + productsizes.k * productsizes.n + productsizes.m * productsizes.n); @@ -267,7 +269,7 @@ struct action_t virtual ~action_t() {} }; -void show_usage_and_exit(int argc, char* argv[], +void show_usage_and_exit(int /*argc*/, char* argv[], const vector>& available_actions) { cerr << "usage: " << argv[0] << " [options...]" << endl << endl; @@ -287,54 +289,204 @@ void show_usage_and_exit(int argc, char* argv[], cerr << " avoid warm caches." << endl; exit(1); } - -void run_benchmarks(vector& benchmarks) + +float measure_clock_speed() { - // randomly shuffling benchmarks allows us to get accurate enough progress info, - // as now the cheap/expensive benchmarks are randomly mixed so they average out. - random_shuffle(benchmarks.begin(), benchmarks.end()); + cerr << "Measuring clock speed... \r" << flush; + + vector all_gflops; + for (int i = 0; i < 8; i++) { + // a good measure of clock speed is obtained by benchmarking small matrices that + // fit in L1 cache and use warm caches (min_working_set_size = 1). + benchmark_t b(128, 128, 128); + b.min_working_set_size = 1; + b.min_accurate_time = 0.1f; // long-running for better accuracy + b.run(); + all_gflops.push_back(b.gflops); + } - // timings here are only used to display progress info. - // Whence the use of real time. - double time_start = timer.getRealTime(); - double time_last_progress_update = time_start; - for (size_t i = 0; i < benchmarks.size(); i++) { - // Display progress info on stderr - double time_now = timer.getRealTime(); - if (time_now > time_last_progress_update + 1.0f) { - time_last_progress_update = time_now; - float ratio_done = float(i) / benchmarks.size(); - cerr.precision(3); - cerr << "Measurements... " << 100.0f * ratio_done - << " %"; + sort(all_gflops.begin(), all_gflops.end()); + float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5]; - if (i > 10) { - cerr << ", ETA "; - int eta = int(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done); - int eta_remainder = eta; - if (eta_remainder > 3600) { - int hours = eta_remainder / 3600; - cerr << hours << " h "; - eta_remainder -= hours * 3600; + // multiply by an arbitrary constant to discourage trying doing anything with the + // returned values besides just comparing them with each other. + float result = stable_estimate * 123.456f; + + return result; +} + +struct human_duration_t +{ + int seconds; + human_duration_t(int s) : seconds(s) {} +}; + +ostream& operator<<(ostream& s, const human_duration_t& d) +{ + int remainder = d.seconds; + if (remainder > 3600) { + int hours = remainder / 3600; + s << hours << " h "; + remainder -= hours * 3600; + } + if (remainder > 60) { + int minutes = remainder / 60; + s << minutes << " min "; + remainder -= minutes * 60; + } + if (d.seconds < 600) { + s << remainder << " s"; + } + return s; +} + +void try_run_some_benchmarks( + vector& benchmarks, + double time_start, + size_t& first_benchmark_to_run, + float& max_clock_speed) +{ + if (first_benchmark_to_run == benchmarks.size()) { + return; + } + + double time_last_progress_update = 0; + double time_last_clock_speed_measurement = 0; + double time_now = 0; + + size_t benchmark_index = first_benchmark_to_run; + + while (true) { + float ratio_done = float(benchmark_index) / benchmarks.size(); + time_now = timer.getRealTime(); + + // We check clock speed every minute and at the end. 
+ if (benchmark_index == benchmarks.size() || + time_now > time_last_clock_speed_measurement + 60.0f) + { + time_last_clock_speed_measurement = time_now; + + // Ensure that clock speed is as expected + float current_clock_speed = measure_clock_speed(); + + // we only allow 1% higher clock speeds, because we want to know the + // clock speed with good accuracy, and this should only cause restarts + // at the beginning of the benchmarks run. + const float tolerance_higher_clock_speed = 1.01f; + + if (current_clock_speed > tolerance_higher_clock_speed * max_clock_speed) { + // Clock speed is now higher than we previously measured. + // Either our initial measurement was inaccurate, which won't happen + // too many times as we are keeping the best clock speed value and + // and allowing some tolerance; or something really weird happened, + // which invalidates all benchmark results collected so far. + // Either way, we better restart all over again now. + if (benchmark_index) { + cerr << "Restarting at " << 100.0f * ratio_done + << " % because clock speed increased. " << endl; } - if (eta_remainder > 60) { - int minutes = eta_remainder / 60; - cerr << minutes << " min "; - eta_remainder -= minutes * 60; - } - if (eta < 600 && eta_remainder) { - cerr << eta_remainder << " s"; + max_clock_speed = current_clock_speed; + first_benchmark_to_run = 0; + return; + } + + // we are a bit more tolerant to lower clock speeds because we don't want + // to cause sleeps and reruns all the time. + const float tolerance_lower_clock_speed = 0.98f; + bool rerun_last_tests = false; + + if (current_clock_speed < tolerance_lower_clock_speed * max_clock_speed) { + cerr << "Measurements completed so far: " + << 100.0f * ratio_done + << " % " << endl; + cerr << "Clock speed seems to be only " + << current_clock_speed/max_clock_speed + << " times what it used to be." << endl; + + unsigned int seconds_to_sleep_if_lower_clock_speed = 1; + + while (current_clock_speed < tolerance_lower_clock_speed * max_clock_speed) { + if (seconds_to_sleep_if_lower_clock_speed > 300) { + cerr << "Sleeping longer probably won't make a difference. Giving up." << endl; + cerr << "Things to try:" << endl; + cerr << " 1. Check if the device is in some energy-saving state." << endl; + cerr << " On Android, it may help to enable 'Stay Awake' in the dev settings." << endl; + cerr << " 2. Check if the device is overheating." << endl; + cerr << " On some devices, system temperature is reported in" << endl; + cerr << " /sys/class/thermal/thermal_zone*/temp" << endl; + cerr << " 3. Some system daemon might be playing with clock speeds." << endl; + cerr << " In particular, on Qualcomm devices, disable mpdecision " << endl; + cerr << " by renaming /system/bin/mpdecision and rebooting." << endl; + cerr << " 4. CPU frequency scaling might conceivably be the problem." << endl; + cerr << " In particular, Intel Turbo Boost. Try disabling that." << endl; + exit(1); + } + rerun_last_tests = true; + cerr << "Sleeping " + << seconds_to_sleep_if_lower_clock_speed + << " s..." << endl; + sleep(seconds_to_sleep_if_lower_clock_speed); + current_clock_speed = measure_clock_speed(); + seconds_to_sleep_if_lower_clock_speed *= 2; } } - cerr << " \r" << flush; + + if (rerun_last_tests) { + cerr << "Redoing the last " + << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size() + << " % because clock speed had been low. 
" << endl; + return; + } + + // nothing wrong with the clock speed so far, so there won't be a need to rerun + // benchmarks run so far in case we later encounter a lower clock speed. + first_benchmark_to_run = benchmark_index; + } + + if (benchmark_index == benchmarks.size()) { + // We're done! + first_benchmark_to_run = benchmarks.size(); + // Erase progress info + cerr << " " << endl; + return; + } + + // Display progress info on stderr + if (time_now > time_last_progress_update + 1.0f) { + time_last_progress_update = time_now; + cerr << "Measurements... " << 100.0f * ratio_done + << " %, ETA " + << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done) + << " \r" << flush; } // This is where we actually run a benchmark! - benchmarks[i].run(); + benchmarks[benchmark_index].run(); + benchmark_index++; } +} - // Erase progress info - cerr << " " << endl; +void run_benchmarks(vector& benchmarks) +{ + // Randomly shuffling benchmarks allows us to get accurate enough progress info, + // as now the cheap/expensive benchmarks are randomly mixed so they average out. + // It also means that if data is corrupted for some time span, the odds are that + // not all repetitions of a given benchmark will be corrupted. + random_shuffle(benchmarks.begin(), benchmarks.end()); + + float max_clock_speed = 0.0f; + for (int i = 0; i < 4; i++) { + max_clock_speed = max(max_clock_speed, measure_clock_speed()); + } + + double time_start = timer.getRealTime(); + size_t first_benchmark_to_run = 0; + while (first_benchmark_to_run < benchmarks.size()) { + try_run_some_benchmarks(benchmarks, + time_start, + first_benchmark_to_run, + max_clock_speed); + } // Sort timings by increasing benchmark parameters, and decreasing gflops. // The latter is very important. 
It means that we can ignore all but the first @@ -414,6 +566,7 @@ struct measure_default_sizes_action_t : action_t int main(int argc, char* argv[]) { + double time_start = timer.getRealTime(); cout.precision(4); cerr.precision(4); @@ -440,7 +593,7 @@ int main(int argc, char* argv[]) for (int i = 2; i < argc; i++) { if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { const char* equals_sign = strchr(argv[i], '='); - min_working_set_size = strtoul(equals_sign+1, nullptr, 10); + g_min_working_set_size = strtoul(equals_sign+1, nullptr, 10); } else { cerr << "unrecognized option: " << argv[i] << endl << endl; show_usage_and_exit(argc, argv, available_actions); @@ -451,17 +604,20 @@ int main(int argc, char* argv[]) cout << "benchmark parameters:" << endl; cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; - cout << "scalar type: " << type_name() << endl; + cout << "scalar type: " << type_name() << endl; cout << "packet size: " << internal::packet_traits::size << endl; cout << "minsize = " << minsize << endl; cout << "maxsize = " << maxsize << endl; cout << "measurement_repetitions = " << measurement_repetitions << endl; - cout << "min_accurate_time = " << min_accurate_time << endl; - cout << "min_working_set_size = " << min_working_set_size; - if (min_working_set_size == 0) { + cout << "g_min_accurate_time = " << g_min_accurate_time << endl; + cout << "g_min_working_set_size = " << g_min_working_set_size; + if (g_min_working_set_size == 0) { cout << " (try to outsize caches)"; } cout << endl << endl; (*action)->run(); + + double time_end = timer.getRealTime(); + cerr << "Finished in " << human_duration_t(time_end - time_start) << endl; } From 4ab01f7c21640f2fb82ecd0877b2d6c662aae7de Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 5 Mar 2015 14:41:16 -0500 Subject: [PATCH 31/83] slightly increase tolerance to clock speed variation --- bench/benchmark-blocking-sizes.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 0bf9c07f8..316aa9d4d 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -369,12 +369,14 @@ void try_run_some_benchmarks( // Ensure that clock speed is as expected float current_clock_speed = measure_clock_speed(); - // we only allow 1% higher clock speeds, because we want to know the - // clock speed with good accuracy, and this should only cause restarts - // at the beginning of the benchmarks run. - const float tolerance_higher_clock_speed = 1.01f; + // The tolerance needs to be smaller than the relative difference between + // clock speeds that a device could operate under. + // It seems unlikely that a device would be throttling clock speeds by + // amounts smaller than 2%. + // With a value of 1%, I was getting within noise on a Sandy Bridge. + const float clock_speed_tolerance = 0.02f; - if (current_clock_speed > tolerance_higher_clock_speed * max_clock_speed) { + if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) { // Clock speed is now higher than we previously measured. // Either our initial measurement was inaccurate, which won't happen // too many times as we are keeping the best clock speed value and @@ -390,12 +392,9 @@ void try_run_some_benchmarks( return; } - // we are a bit more tolerant to lower clock speeds because we don't want - // to cause sleeps and reruns all the time. 
- const float tolerance_lower_clock_speed = 0.98f; bool rerun_last_tests = false; - if (current_clock_speed < tolerance_lower_clock_speed * max_clock_speed) { + if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { cerr << "Measurements completed so far: " << 100.0f * ratio_done << " % " << endl; @@ -405,7 +404,7 @@ void try_run_some_benchmarks( unsigned int seconds_to_sleep_if_lower_clock_speed = 1; - while (current_clock_speed < tolerance_lower_clock_speed * max_clock_speed) { + while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { if (seconds_to_sleep_if_lower_clock_speed > 300) { cerr << "Sleeping longer probably won't make a difference. Giving up." << endl; cerr << "Things to try:" << endl; @@ -479,9 +478,12 @@ void run_benchmarks(vector& benchmarks) max_clock_speed = max(max_clock_speed, measure_clock_speed()); } - double time_start = timer.getRealTime(); + double time_start = 0.0; size_t first_benchmark_to_run = 0; while (first_benchmark_to_run < benchmarks.size()) { + if (first_benchmark_to_run == 0) { + time_start = timer.getRealTime(); + } try_run_some_benchmarks(benchmarks, time_start, first_benchmark_to_run, From 58740ce4c60c9230f1e030ae45508dba10ba1211 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 10:30:35 +0100 Subject: [PATCH 32/83] Improve product kernel: replace the previous dynamic loop swaping strategy by a more general one: It consists in increasing the actual number of rows of lhs's micro horizontal panel for small depth such that L1 cache is fully exploited. --- .../Core/products/GeneralBlockPanelKernel.h | 87 ++++++++----------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8f4ee4dbb..72957cdc8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -230,6 +230,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads { // So far, no blocking at all, i.e., kc==k, and nc==n. // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 + // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete. Index problem_size = k*n*sizeof(LhsScalar); Index actual_lm = actual_l2; Index max_mc = m; @@ -951,33 +952,28 @@ void gebp_kernel=3*Traits::LhsProgress) - { -#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION - const bool swap_loops = EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION; -#else - const bool swap_loops = depth<48; -#endif - - Index bound1 = swap_loops ? packet_cols4 : peeled_mc3; - Index bound2 = !swap_loops ? packet_cols4 : peeled_mc3; - Index incr1 = swap_loops ? nr : 3*Traits::LhsProgress; - Index incr2 = !swap_loops ? nr : 3*Traits::LhsProgress; - + { PossiblyRotatingKernelHelper possiblyRotatingKernelHelper(traits); - - // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) - // and on each largest micro vertical panel of rhs (depth * nr) - for(Index it1=0; it1=2*Traits::LhsProgress) { -#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION - const bool swap_loops = (mr<3*Traits::LhsProgress) && (EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION); -#else - const bool swap_loops = (mr<3*Traits::LhsProgress) && (depth<48); -#endif - Index start1 = swap_loops ? 0 : peeled_mc3; - Index start2 = !swap_loops ? 0 : peeled_mc3; - Index bound1 = swap_loops ? 
packet_cols4 : peeled_mc2; - Index bound2 = !swap_loops ? packet_cols4 : peeled_mc2; - Index incr1 = swap_loops ? nr : 2*Traits::LhsProgress; - Index incr2 = !swap_loops ? nr : 2*Traits::LhsProgress; - - for(Index it1=start1; it1 Date: Fri, 6 Mar 2015 11:47:13 +0100 Subject: [PATCH 33/83] Update gemm performance monitoring tool: - permit to recompute a subset of changesets - update changeset list - add a few more cases --- bench/perf_monitoring/gemm/changesets.txt | 67 ++++++++++++----------- bench/perf_monitoring/gemm/run_gemm.sh | 47 +++++++++++++--- bench/perf_monitoring/gemm/settings.txt | 4 ++ 3 files changed, 78 insertions(+), 40 deletions(-) diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index f19b4287d..b379d7bd2 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -1,39 +1,42 @@ -3.0.1 -3.1.1 -3.2.0 +#3.0.1 +#3.1.1 +#3.2.0 3.2.4 -5745:37f59e65eb6c -5891:d8652709345d -5893:24b4dc92c6d3 -5895:997c2ef9fc8b -5904:e1eafd14eaa1 -5908:f8ee3c721251 -5921:ca808bb456b0 -5927:8b1001f9e3ac -5937:5a4ca1ad8c53 -5949:f3488f4e45b2 -5969:e09031dccfd9 -5992:4a429f5e0483 +#5745:37f59e65eb6c +5891:d8652709345d # introduce AVX +#5893:24b4dc92c6d3 # merge +5895:997c2ef9fc8b # introduce FMA +#5904:e1eafd14eaa1 # complex and AVX +5908:f8ee3c721251 # improve packing with ptranspose +#5921:ca808bb456b0 # merge +#5927:8b1001f9e3ac +5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks +#5949:f3488f4e45b2 # merge +#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec +#5992:4a429f5e0483 # merge before-evaluators -6334:f6a45e5b8b7c -6639:c9121c60b5c7 -6655:06f163b5221f -6677:700e023044e7 # FMA has been wrongly disabled -6681:11d31dafb0e3 -6699:5e6e8e10aad1 # merge default to tensors -6726:ff2d2388e7b9 # merge default to tensors -6742:0cbd6195e829 # merge default to tensors -6747:853d2bafeb8f # Generalized the gebp apis +#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products +#6639:c9121c60b5c7 +#6655:06f163b5221f # Properly detect FMA support on ARM +#6677:700e023044e7 # FMA has been wrongly disabled +#6681:11d31dafb0e3 +#6699:5e6e8e10aad1 # merge default to tensors +#6726:ff2d2388e7b9 # merge default to tensors +#6742:0cbd6195e829 # merge default to tensors +#6747:853d2bafeb8f # Generalized the gebp apis 6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation -6781:9cc5a931b2c6 # generalized gemv -6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product -6844:039efd86b75c # merge tensor +#6781:9cc5a931b2c6 # generalized gemv +#6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product +#6844:039efd86b75c # merge tensor 6845:7333ed40c6ef # change prefetching in gebp -6856:b5be5e10eb7f # merge index conversion -6893:c3a64aba7c70 # clean blocking size computation -6898:6fb31ebe6492 # rotating kernel for ARM +#6856:b5be5e10eb7f # merge index conversion +#6893:c3a64aba7c70 # clean blocking size computation +#6898:6fb31ebe6492 # rotating kernel for ARM 6899:877facace746 # rotating kernel for ARM only -6904:c250623ae9fa # result_of +#6904:c250623ae9fa # result_of 6921:915f1b1fc158 # fix prefetching change for ARM 6923:9ff25f6dacc6 # prefetching -6933:52572e60b5d3 # blocking size strategy \ No newline at end of file +6933:52572e60b5d3 # blocking size strategy +6937:c8c042f286b2 # 
avoid redundant pack_rhs +6981:7e5d6f78da59 # dynamic loop swapping +6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh index d3a9fadc9..3fa6a3661 100755 --- a/bench/perf_monitoring/gemm/run_gemm.sh +++ b/bench/perf_monitoring/gemm/run_gemm.sh @@ -6,6 +6,7 @@ # Options: # -up : enforce the recomputation of existing data, and keep best results as a merging strategy +# -s : recompute selected changesets only and keep bests if echo "$*" | grep '\-up' > /dev/null; then @@ -14,14 +15,30 @@ else update=false fi -if [ $update == true ]; then +if echo "$*" | grep '\-s' > /dev/null; then + selected=true +else + selected=false +fi + +global_args="$*" + +if [ $selected == true ]; then + echo "Recompute selected changesets only and keep bests" +elif [ $update == true ]; then echo "(Re-)Compute all changesets and keep bests" else echo "Skip previously computed changesets" fi + + if [ ! -d "eigen_src" ]; then hg clone https://bitbucket.org/eigen/eigen eigen_src +else + cd eigen_src + hg pull -u + cd .. fi if [ ! -z '$CXX' ]; then @@ -61,17 +78,31 @@ function test_current scalar=$2 name=$3 - prev=`grep $rev "$name.backup" | cut -c 14-` + prev="" + if [ -e "$name.backup" ]; then + prev=`grep $rev "$name.backup" | cut -c 14-` + fi res=$prev count_rev=`echo $prev | wc -w` count_ref=`cat "settings.txt" | wc -l` - if [ $update == true ] || [ $count_rev != $count_ref ]; then + if echo "$global_args" | grep "$rev" > /dev/null; then + rev_found=true + else + rev_found=false + fi +# echo $update et $selected et $rev_found because $rev et "$global_args" +# echo $count_rev et $count_ref + if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] && [ $rev_found == true ]); then if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then curr=`./$name` - echo merge $prev - echo with $curr + if [ $count_rev == $count_ref ]; then + echo "merge previous $prev" + echo "with new $curr" + else + echo "got $curr" + fi res=`merge "$curr" "$prev"` - echo $res +# echo $res echo "$rev $res" >> $name.out else echo "Compilation failed, skip rev $rev" @@ -86,12 +117,12 @@ make_backup $PREFIX"sgemm" make_backup $PREFIX"dgemm" make_backup $PREFIX"cgemm" -cut -f1 -d"#" < changesets.txt | while read rev +cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev do if [ ! -z '$rev' ]; then echo "Testing rev $rev" cd eigen_src - hg up -C $rev + hg up -C $rev > /dev/null actual_rev=`hg identify | cut -f1 -d' '` cd .. 
diff --git a/bench/perf_monitoring/gemm/settings.txt b/bench/perf_monitoring/gemm/settings.txt index 6ef690708..5c43e1c7d 100644 --- a/bench/perf_monitoring/gemm/settings.txt +++ b/bench/perf_monitoring/gemm/settings.txt @@ -1,5 +1,6 @@ 8 8 8 9 9 9 +24 24 24 239 239 239 240 240 240 2400 24 24 @@ -8,4 +9,7 @@ 24 2400 2400 2400 24 2400 2400 2400 24 +2400 2400 64 +4800 23 160 +23 4800 160 2400 2400 2400 From cd3bbffa73a7bc33e6c3f3a63ab2e9eed7cc322c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 14:31:39 +0100 Subject: [PATCH 34/83] Improve blocking heuristic: if the lhs fit within L1, then block on the rhs in L1 (allows to keep packed rhs in L1) --- .../Core/products/GeneralBlockPanelKernel.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 72957cdc8..cfeeb27f8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -213,8 +213,22 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. // When k= Index(Traits::nr*sizeof(RhsScalar))*k) + { + // L1 blocking + max_nc = remaining_l1 / (k*sizeof(RhsScalar)); + } + else + { + // L2 blocking + max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); + } // WARNING Below, we assume that Traits::nr is a power of two. Index nc = std::min(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); if(n>nc) From 4c8eeeaed6e5fe3f62c06e743971cefb5eaff359 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 15:08:20 +0100 Subject: [PATCH 35/83] update gemm changeset list --- bench/benchmark-blocking-sizes.cpp | 2 +- bench/perf_monitoring/gemm/changesets.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 0bf9c07f8..465f5d2c8 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -199,7 +199,7 @@ void benchmark_t::run() double starttime = timer.getCpuTime(); for (int i = 0; i < iters_at_a_time; i++) { - dst[matrix_index] = lhs[matrix_index] * rhs[matrix_index]; + dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index]; matrix_index++; if (matrix_index == matrix_pool_size) { matrix_index = 0; diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index b379d7bd2..a5b63bc89 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -40,3 +40,4 @@ before-evaluators 6937:c8c042f286b2 # avoid redundant pack_rhs 6981:7e5d6f78da59 # dynamic loop swapping 6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache +6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. 
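
A side note on the benchmark fix in the patch above: switching the timed statement to noalias() matters because with a plain assignment Eigen must assume the destination could alias an operand, so the product is evaluated into a temporary and then copied, and the benchmark would partly measure that copy. A minimal standalone sketch of the difference (illustrative only, not part of the patch):

#include <Eigen/Core>
using Eigen::MatrixXf;

// Plain assignment: Eigen assumes possible aliasing, so the product is
// evaluated into a temporary and then copied into dst.
void product_via_temporary(MatrixXf& dst, const MatrixXf& lhs, const MatrixXf& rhs)
{
  dst = lhs * rhs;
}

// noalias() promises dst does not overlap lhs/rhs, so the product kernel
// writes its result directly into dst, which is the work we want to time.
void product_in_place(MatrixXf& dst, const MatrixXf& lhs, const MatrixXf& rhs)
{
  dst.noalias() = lhs * rhs;
}
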
From 87681e508f1da5064f5b2699f17049306ac9a2ca Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 16:11:22 +0100 Subject: [PATCH 36/83] bug #978: early return for vanishing products --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index c76f48154..7fd707ed7 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -457,6 +457,8 @@ struct generic_product_impl static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) { eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); + if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) + return; typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); From d23fcc0672d9edd917cfca0e32ea6d14300b492b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 16:12:08 +0100 Subject: [PATCH 37/83] bug #978: add unit test for zero-sized products --- test/product_extra.cpp | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/test/product_extra.cpp b/test/product_extra.cpp index 744a1ef7f..1b4c6c33c 100644 --- a/test/product_extra.cpp +++ b/test/product_extra.cpp @@ -109,8 +109,33 @@ void mat_mat_scalar_scalar_product() double det = 6.0, wt = 0.5; VERIFY_IS_APPROX(dNdxy.transpose()*dNdxy*det*wt, det*wt*dNdxy.transpose()*dNdxy); } + +template +void zero_sized_objects(const MatrixType& m) +{ + Index rows = m.rows(); + Index cols = m.cols(); -void zero_sized_objects() + { + MatrixType res, a(rows,0), b(0,cols); + VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(rows,cols) ); + VERIFY_IS_APPROX( (res=a*a.transpose()), MatrixType::Zero(rows,rows) ); + VERIFY_IS_APPROX( (res=b.transpose()*b), MatrixType::Zero(cols,cols) ); + VERIFY_IS_APPROX( (res=b.transpose()*a.transpose()), MatrixType::Zero(cols,rows) ); + } + + { + MatrixType res, a(rows,cols), b(cols,0); + res = a*b; + VERIFY(res.rows()==rows && res.cols()==0); + b.resize(0,rows); + res = b*a; + VERIFY(res.rows()==0 && res.cols()==cols); + } +} + + +void bug_127() { // Bug 127 // @@ -171,7 +196,8 @@ void test_product_extra() CALL_SUBTEST_2( mat_mat_scalar_scalar_product() ); CALL_SUBTEST_3( product_extra(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); + CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } - CALL_SUBTEST_5( zero_sized_objects() ); + CALL_SUBTEST_5( bug_127() ); CALL_SUBTEST_6( unaligned_objects() ); } From 14a5f135a32eed85033a5061d96e2d45e23d50aa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 17:51:31 +0100 Subject: [PATCH 38/83] bug #969: workaround abiguous calls to Ref using enable_if. 
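
The idea of the workaround, in isolation: each greedy constructor template of Ref is guarded by enable_if on its compile-time match predicate, so non-matching Ref specializations drop out of overload resolution instead of all looking equally viable. A hand-written sketch of the same idiom using std::enable_if (the patch itself uses Eigen's internal::enable_if and traits; the type names below are invented for illustration):

#include <type_traits>
#include <iostream>

struct ColMajorTag {};
struct RowMajorTag {};

template <typename Expected>
struct RefLike
{
  // Without the enable_if guard this constructor template would make
  // RefLike<ColMajorTag> and RefLike<RowMajorTag> equally viable targets for
  // any argument, mirroring the kind of ambiguity addressed by bug 969.
  template <typename Actual>
  RefLike(const Actual&,
          typename std::enable_if<std::is_same<Actual, Expected>::value, void>::type* = 0) {}
};

int pick(RefLike<ColMajorTag>) { return 1; }
int pick(RefLike<RowMajorTag>) { return 2; }

int main()
{
  ColMajorTag c;
  RowMajorTag r;
  std::cout << pick(c) << ' ' << pick(r) << '\n';  // prints "1 2": each call now has a unique viable overload
}
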
--- Eigen/src/Core/Ref.h | 16 +++++++++++----- test/ref.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 0cb117949..ea5a2bd5c 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -105,7 +105,8 @@ struct traits > OuterStrideMatch = Derived::IsVectorAtCompileTime || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime), AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits::Flags&AlignedBit)==AlignedBit), - MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch + ScalarTypeMatch = internal::is_same::value, + MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch }; typedef typename internal::conditional::type type; }; @@ -184,9 +185,11 @@ protected: template class Ref : public RefBase > { + private: typedef internal::traits Traits; template - EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase& expr); + EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0); public: typedef RefBase Base; @@ -195,13 +198,15 @@ template class Ref #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase& expr) + EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) { EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); Base::construct(expr.derived()); } template - EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr) + EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) #else template inline Ref(DenseBase& expr) @@ -228,7 +233,8 @@ template class Ref< EIGEN_DENSE_PUBLIC_INTERFACE(Ref) template - EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr) + EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, + typename internal::enable_if::ScalarTypeMatch),Derived>::type* = 0) { // std::cout << match_helper::HasDirectAccess << "," << match_helper::OuterStrideMatch << "," << match_helper::InnerStrideMatch << "\n"; // std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n"; diff --git a/test/ref.cpp b/test/ref.cpp index b9470213c..fbe2c450f 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -228,6 +228,28 @@ void call_ref() VERIFY_EVALUATION_COUNT( call_ref_7(c,c), 0); } +typedef Matrix RowMatrixXd; +int test_ref_overload_fun1(Ref ) { return 1; } +int test_ref_overload_fun1(Ref ) { return 2; } +int test_ref_overload_fun1(Ref ) { return 3; } + +int test_ref_overload_fun2(Ref ) { return 4; } +int test_ref_overload_fun2(Ref ) { return 5; } + +// See also bug 969 +void test_ref_overloads() +{ + MatrixXd Ad, Bd; + RowMatrixXd rAd, rBd; + VERIFY( test_ref_overload_fun1(Ad)==1 ); + VERIFY( test_ref_overload_fun1(rAd)==2 ); + + MatrixXf Af, Bf; + VERIFY( test_ref_overload_fun2(Ad)==4 ); + VERIFY( test_ref_overload_fun2(Ad+Bd)==4 ); + VERIFY( test_ref_overload_fun2(Af+Bf)==5 ); +} + void test_ref() { for(int i = 0; i < g_repeat; i++) { @@ -248,4 +270,6 @@ void test_ref() CALL_SUBTEST_5( ref_matrix(MatrixXi(internal::random(1,10),internal::random(1,10))) ); CALL_SUBTEST_6( call_ref() ); } + + CALL_SUBTEST_7( 
test_ref_overloads() ); } From 0ee391863ed8bee95356fad8b7202177290e0338 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 21:51:09 +0100 Subject: [PATCH 39/83] Avoid undeflow when blocking size are tuned manually. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index cfeeb27f8..408281c82 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -975,7 +975,11 @@ void gebp_kernel(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); +#else + const Index actual_panel_rows = (3*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ); +#endif for(Index i1=0; i1=2*Traits::LhsProgress) { const Index l1 = 32*1024; // in Bytes, TODO, l1 should be passed to this function. +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + Index actual_panel_rows = (2*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); +#else Index actual_panel_rows = (2*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ); +#endif for(Index i1=peeled_mc3; i1 Date: Fri, 6 Mar 2015 19:11:50 -0500 Subject: [PATCH 40/83] Automatically serialize partial results to disk, reboot, and resume, when timings are getting bad --- bench/benchmark-blocking-sizes.cpp | 147 ++++++++++++++++++++--------- 1 file changed, 105 insertions(+), 42 deletions(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 316aa9d4d..e5932b297 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -13,6 +13,7 @@ #include #include #include +#include bool eigen_use_specific_block_size; int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n; @@ -37,10 +38,12 @@ const int measurement_repetitions = 3; // Timings below this value are too short to be accurate, // we'll repeat measurements with more iterations until // we get a timing above that threshold. -const float g_min_accurate_time = 1e-2f; +const float min_accurate_time = 1e-2f; // See --min-working-set-size command line parameter. 
-size_t g_min_working_set_size = 0; +size_t min_working_set_size = 0; + +float max_clock_speed = 0.0f; // range of sizes that we will benchmark (in all 3 K,M,N dimensions) const size_t maxsize = 2048; @@ -97,24 +100,25 @@ struct benchmark_t uint16_t compact_block_size; bool use_default_block_size; float gflops; - size_t min_working_set_size; - float min_accurate_time; + benchmark_t() + : compact_product_size(0) + , compact_block_size(0) + , use_default_block_size(false) + , gflops(0) + { + } benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(compact_size_triple(bk, bm, bn)) , use_default_block_size(false) , gflops(0) - , min_working_set_size(g_min_working_set_size) - , min_accurate_time(g_min_accurate_time) {} benchmark_t(size_t pk, size_t pm, size_t pn) : compact_product_size(compact_size_triple(pk, pm, pn)) , compact_block_size(0) , use_default_block_size(true) , gflops(0) - , min_working_set_size(g_min_working_set_size) - , min_accurate_time(g_min_accurate_time) {} void run(); @@ -296,11 +300,7 @@ float measure_clock_speed() vector all_gflops; for (int i = 0; i < 8; i++) { - // a good measure of clock speed is obtained by benchmarking small matrices that - // fit in L1 cache and use warm caches (min_working_set_size = 1). - benchmark_t b(128, 128, 128); - b.min_working_set_size = 1; - b.min_accurate_time = 0.1f; // long-running for better accuracy + benchmark_t b(1024, 1024, 1024); b.run(); all_gflops.push_back(b.gflops); } @@ -340,11 +340,52 @@ ostream& operator<<(ostream& s, const human_duration_t& d) return s; } +const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data"; + +void serialize_benchmarks(const char* filename, const vector& benchmarks, size_t first_benchmark_to_run) +{ + FILE* file = fopen(filename, "w"); + if (!file) { + cerr << "Could not open file " << filename << " for writing." << endl; + cerr << "Do you have write permissions on the current working directory?" 
<< endl; + exit(1); + } + size_t benchmarks_vector_size = benchmarks.size(); + fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file); + fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file); + fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file); + fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file); + fclose(file); +} + +bool deserialize_benchmarks(const char* filename, vector& benchmarks, size_t& first_benchmark_to_run) +{ + FILE* file = fopen(filename, "r"); + if (!file) { + return false; + } + if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) { + return false; + } + size_t benchmarks_vector_size = 0; + if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) { + return false; + } + if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) { + return false; + } + benchmarks.resize(benchmarks_vector_size); + if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) { + return false; + } + unlink(filename); + return true; +} + void try_run_some_benchmarks( vector& benchmarks, double time_start, - size_t& first_benchmark_to_run, - float& max_clock_speed) + size_t& first_benchmark_to_run) { if (first_benchmark_to_run == benchmarks.size()) { return; @@ -405,20 +446,12 @@ void try_run_some_benchmarks( unsigned int seconds_to_sleep_if_lower_clock_speed = 1; while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { - if (seconds_to_sleep_if_lower_clock_speed > 300) { - cerr << "Sleeping longer probably won't make a difference. Giving up." << endl; - cerr << "Things to try:" << endl; - cerr << " 1. Check if the device is in some energy-saving state." << endl; - cerr << " On Android, it may help to enable 'Stay Awake' in the dev settings." << endl; - cerr << " 2. Check if the device is overheating." << endl; - cerr << " On some devices, system temperature is reported in" << endl; - cerr << " /sys/class/thermal/thermal_zone*/temp" << endl; - cerr << " 3. Some system daemon might be playing with clock speeds." << endl; - cerr << " In particular, on Qualcomm devices, disable mpdecision " << endl; - cerr << " by renaming /system/bin/mpdecision and rebooting." << endl; - cerr << " 4. CPU frequency scaling might conceivably be the problem." << endl; - cerr << " In particular, Intel Turbo Boost. Try disabling that." << endl; - exit(1); + if (seconds_to_sleep_if_lower_clock_speed > 30) { + cerr << "Sleeping longer probably won't make a difference." << endl; + cerr << "Serializing benchmarks to " << session_filename << endl; + serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run); + cerr << "Now restart this benchmark, and it should pick up where we left." << endl; + exit(2); } rerun_last_tests = true; cerr << "Sleeping " @@ -467,27 +500,57 @@ void try_run_some_benchmarks( void run_benchmarks(vector& benchmarks) { - // Randomly shuffling benchmarks allows us to get accurate enough progress info, - // as now the cheap/expensive benchmarks are randomly mixed so they average out. - // It also means that if data is corrupted for some time span, the odds are that - // not all repetitions of a given benchmark will be corrupted. 
- random_shuffle(benchmarks.begin(), benchmarks.end()); + size_t first_benchmark_to_run; + vector deserialized_benchmarks; + bool use_deserialized_benchmarks = false; + if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) { + cerr << "Found serialized session with " + << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size() + << " % already done" << endl; + if (deserialized_benchmarks.size() == benchmarks.size() && + first_benchmark_to_run > 0 && + first_benchmark_to_run < benchmarks.size()) + { + bool found_mismatch = false; + for (size_t i = 0; i < benchmarks.size(); i++) { + if (deserialized_benchmarks[i].compact_product_size != benchmarks[i].compact_product_size || + deserialized_benchmarks[i].compact_block_size != benchmarks[i].compact_block_size || + deserialized_benchmarks[i].use_default_block_size != benchmarks[i].use_default_block_size) + { + cerr << "Mismatch in serialized session. Ignoring it." << endl; + found_mismatch = true; + break; + } + } + use_deserialized_benchmarks = !found_mismatch; + } + } + + if (use_deserialized_benchmarks) { + benchmarks = deserialized_benchmarks; + } else { + // not using deserialized benchmarks, starting from scratch + first_benchmark_to_run = 0; + + // Randomly shuffling benchmarks allows us to get accurate enough progress info, + // as now the cheap/expensive benchmarks are randomly mixed so they average out. + // It also means that if data is corrupted for some time span, the odds are that + // not all repetitions of a given benchmark will be corrupted. + random_shuffle(benchmarks.begin(), benchmarks.end()); + } - float max_clock_speed = 0.0f; for (int i = 0; i < 4; i++) { max_clock_speed = max(max_clock_speed, measure_clock_speed()); } double time_start = 0.0; - size_t first_benchmark_to_run = 0; while (first_benchmark_to_run < benchmarks.size()) { if (first_benchmark_to_run == 0) { time_start = timer.getRealTime(); } try_run_some_benchmarks(benchmarks, time_start, - first_benchmark_to_run, - max_clock_speed); + first_benchmark_to_run); } // Sort timings by increasing benchmark parameters, and decreasing gflops. @@ -595,7 +658,7 @@ int main(int argc, char* argv[]) for (int i = 2; i < argc; i++) { if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { const char* equals_sign = strchr(argv[i], '='); - g_min_working_set_size = strtoul(equals_sign+1, nullptr, 10); + min_working_set_size = strtoul(equals_sign+1, nullptr, 10); } else { cerr << "unrecognized option: " << argv[i] << endl << endl; show_usage_and_exit(argc, argv, available_actions); @@ -611,9 +674,9 @@ int main(int argc, char* argv[]) cout << "minsize = " << minsize << endl; cout << "maxsize = " << maxsize << endl; cout << "measurement_repetitions = " << measurement_repetitions << endl; - cout << "g_min_accurate_time = " << g_min_accurate_time << endl; - cout << "g_min_working_set_size = " << g_min_working_set_size; - if (g_min_working_set_size == 0) { + cout << "min_accurate_time = " << min_accurate_time << endl; + cout << "min_working_set_size = " << min_working_set_size; + if (min_working_set_size == 0) { cout << " (try to outsize caches)"; } cout << endl << endl; From 39228cb22414209e1880d84699b5f8e481c2b436 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 6 Mar 2015 19:29:01 -0500 Subject: [PATCH 41/83] deserialization assumed benchmarks in same order, but we shuffle them. 
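
Context for this fix: the session file written by the previous patch is just a small header (max clock speed, vector size, resume index) followed by the raw benchmark_t array, and since the benchmark list is shuffled before it is serialized, a freshly generated, unshuffled list cannot be compared entry by entry against the deserialized one. A stripped-down sketch of the same checkpoint/resume pattern (plain C stdio over POD entries; entry_t and its fields are invented for illustration):

#include <cstdio>
#include <vector>

struct entry_t { unsigned product; unsigned block; float gflops; };  // POD stand-in for benchmark_t

// Write a resumable checkpoint: a small header first, then the raw entries.
bool save_session(const char* path, const std::vector<entry_t>& v, size_t next_index)
{
  FILE* f = std::fopen(path, "wb");
  if (!f) return false;
  size_t n = v.size();
  bool ok = std::fwrite(&n, sizeof n, 1, f) == 1
         && std::fwrite(&next_index, sizeof next_index, 1, f) == 1
         && std::fwrite(v.data(), sizeof(entry_t), n, f) == n;
  std::fclose(f);
  return ok;
}

// Read it back; on success the caller resumes at next_index instead of 0.
bool load_session(const char* path, std::vector<entry_t>& v, size_t& next_index)
{
  FILE* f = std::fopen(path, "rb");
  if (!f) return false;
  size_t n = 0;
  bool ok = std::fread(&n, sizeof n, 1, f) == 1
         && std::fread(&next_index, sizeof next_index, 1, f) == 1;
  if (ok) { v.resize(n); ok = std::fread(v.data(), sizeof(entry_t), n, f) == n; }
  std::fclose(f);
  return ok;
}
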
--- bench/benchmark-blocking-sizes.cpp | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 83e973a69..33d3824eb 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -511,18 +511,7 @@ void run_benchmarks(vector& benchmarks) first_benchmark_to_run > 0 && first_benchmark_to_run < benchmarks.size()) { - bool found_mismatch = false; - for (size_t i = 0; i < benchmarks.size(); i++) { - if (deserialized_benchmarks[i].compact_product_size != benchmarks[i].compact_product_size || - deserialized_benchmarks[i].compact_block_size != benchmarks[i].compact_block_size || - deserialized_benchmarks[i].use_default_block_size != benchmarks[i].use_default_block_size) - { - cerr << "Mismatch in serialized session. Ignoring it." << endl; - found_mismatch = true; - break; - } - } - use_deserialized_benchmarks = !found_mismatch; + use_deserialized_benchmarks = true; } } From cf9940e17b5799c35b6a05764bec166d07906b6f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 13:54:05 +0100 Subject: [PATCH 42/83] Make sparse unit-test helpers aware of StorageIndex --- test/sparse.h | 14 +++++++------- test/sparse_solver.h | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/sparse.h b/test/sparse.h index 81ab9e873..3c3a0c9be 100644 --- a/test/sparse.h +++ b/test/sparse.h @@ -53,15 +53,15 @@ enum { * \param zeroCoords and nonzeroCoords allows to get the coordinate lists of the non zero, * and zero coefficients respectively. */ -template void +template void initSparse(double density, Matrix& refMat, - SparseMatrix& sparseMat, + SparseMatrix& sparseMat, int flags = 0, - std::vector >* zeroCoords = 0, - std::vector >* nonzeroCoords = 0) + std::vector >* zeroCoords = 0, + std::vector >* nonzeroCoords = 0) { - enum { IsRowMajor = SparseMatrix::IsRowMajor }; + enum { IsRowMajor = SparseMatrix::IsRowMajor }; sparseMat.setZero(); //sparseMat.reserve(int(refMat.rows()*refMat.cols()*density)); sparseMat.reserve(VectorXi::Constant(IsRowMajor ? 
refMat.rows() : refMat.cols(), int((1.5*density)*(IsRowMajor?refMat.cols():refMat.rows())))); @@ -93,11 +93,11 @@ initSparse(double density, //sparseMat.insertBackByOuterInner(j,i) = v; sparseMat.insertByOuterInner(j,i) = v; if (nonzeroCoords) - nonzeroCoords->push_back(Matrix (ai,aj)); + nonzeroCoords->push_back(Matrix (ai,aj)); } else if (zeroCoords) { - zeroCoords->push_back(Matrix (ai,aj)); + zeroCoords->push_back(Matrix (ai,aj)); } refMat(ai,aj) = v; } diff --git a/test/sparse_solver.h b/test/sparse_solver.h index f266e2c9a..a078851c3 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -230,7 +230,7 @@ template void check_sparse_spd_solving(Solver& solver) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; - typedef SparseMatrix SpMat; + typedef SparseMatrix SpMat; typedef Matrix DenseMatrix; typedef Matrix DenseVector; @@ -304,12 +304,12 @@ template void check_sparse_spd_determinant(Solver& solver) } template -int generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag) +Index generate_sparse_square_problem(Solver&, typename Solver::MatrixType& A, DenseMat& dA, int maxSize = 300, int options = ForceNonZeroDiag) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; - int size = internal::random(1,maxSize); + Index size = internal::random(1,maxSize); double density = (std::max)(8./(size*size), 0.01); A.resize(size,size); @@ -324,7 +324,7 @@ template void check_sparse_square_solving(Solver& solver) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; - typedef SparseMatrix SpMat; + typedef SparseMatrix SpMat; typedef Matrix DenseMatrix; typedef Matrix DenseVector; @@ -333,7 +333,7 @@ template void check_sparse_square_solving(Solver& solver) Mat A; DenseMatrix dA; for (int i = 0; i < g_repeat; i++) { - int size = generate_sparse_square_problem(solver, A, dA); + Index size = generate_sparse_square_problem(solver, A, dA); A.makeCompressed(); DenseVector b = DenseVector::Random(size); @@ -430,7 +430,7 @@ template void check_sparse_leastsquare_solving(Solver& solver) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; - typedef SparseMatrix SpMat; + typedef SparseMatrix SpMat; typedef Matrix DenseMatrix; typedef Matrix DenseVector; From 224a1fe4c6991c863986d8c8bd3d41af5aa4ff80 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 13:55:20 +0100 Subject: [PATCH 43/83] bug #963: make IncompleteLUT compatible with non-default storage index types. 
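
What this enables, in the spirit of the updated iterative-solver tests: the preconditioner's index type can now be chosen to match a sparse matrix whose StorageIndex is not int. A small usage sketch (illustrative only; a trivial identity system just to show the instantiation):

#include <Eigen/Sparse>
using namespace Eigen;

int main()
{
  // Sparse matrix with a non-default storage index type.
  typedef SparseMatrix<double, ColMajor, long int> SpMatLong;
  SpMatLong A(100, 100);
  A.setIdentity();
  VectorXd b = VectorXd::Random(100);

  // IncompleteLUT is now templated on the storage index as well,
  // so it can be paired with SpMatLong inside BiCGSTAB.
  BiCGSTAB<SpMatLong, IncompleteLUT<double, long int> > solver;
  solver.compute(A);
  VectorXd x = solver.solve(b);
  return (solver.info() == Success && x.isApprox(b)) ? 0 : 1;
}
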
--- .../IterativeLinearSolvers/IncompleteLUT.h | 41 ++++++++++--------- test/bicgstab.cpp | 13 +++--- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 6d63d45e4..b7f8debb3 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -93,21 +93,23 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) * alternatively, on GMANE: * http://comments.gmane.org/gmane.comp.lib.eigen/3302 */ -template -class IncompleteLUT : public SparseSolverBase > +template +class IncompleteLUT : public SparseSolverBase > { protected: - typedef SparseSolverBase > Base; + typedef SparseSolverBase Base; using Base::m_isInitialized; public: typedef _Scalar Scalar; + typedef _StorageIndex StorageIndex; typedef typename NumTraits::Real RealScalar; typedef Matrix Vector; - typedef SparseMatrix FactorType; - typedef SparseMatrix PermutType; - typedef typename FactorType::StorageIndex StorageIndex; + typedef Matrix VectorI; + typedef SparseMatrix FactorType; public: + + // this typedef is only to export the scalar type and compile-time dimensions to solve_retval typedef Matrix MatrixType; IncompleteLUT() @@ -151,7 +153,7 @@ class IncompleteLUT : public SparseSolverBase > * **/ template - IncompleteLUT& compute(const MatrixType& amat) + IncompleteLUT& compute(const MatrixType& amat) { analyzePattern(amat); factorize(amat); @@ -197,8 +199,8 @@ protected: * Set control parameter droptol * \param droptol Drop any element whose magnitude is less than this tolerance **/ -template -void IncompleteLUT::setDroptol(const RealScalar& droptol) +template +void IncompleteLUT::setDroptol(const RealScalar& droptol) { this->m_droptol = droptol; } @@ -207,15 +209,15 @@ void IncompleteLUT::setDroptol(const RealScalar& droptol) * Set control parameter fillfactor * \param fillfactor This is used to compute the number @p fill_in of largest elements to keep on each row. 
**/ -template -void IncompleteLUT::setFillfactor(int fillfactor) +template +void IncompleteLUT::setFillfactor(int fillfactor) { this->m_fillfactor = fillfactor; } -template +template template -void IncompleteLUT::analyzePattern(const _MatrixType& amat) +void IncompleteLUT::analyzePattern(const _MatrixType& amat) { // Compute the Fill-reducing permutation SparseMatrix mat1 = amat; @@ -232,9 +234,9 @@ void IncompleteLUT::analyzePattern(const _MatrixType& amat) m_analysisIsOk = true; } -template +template template -void IncompleteLUT::factorize(const _MatrixType& amat) +void IncompleteLUT::factorize(const _MatrixType& amat) { using std::sqrt; using std::swap; @@ -246,8 +248,8 @@ void IncompleteLUT::factorize(const _MatrixType& amat) m_lu.resize(n,n); // Declare Working vectors and variables Vector u(n) ; // real values of the row -- maximum size is n -- - VectorXi ju(n); // column position of the values in u -- maximum size is n - VectorXi jr(n); // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1 + VectorI ju(n); // column position of the values in u -- maximum size is n + VectorI jr(n); // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1 // Apply the fill-reducing permutation eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); @@ -398,7 +400,7 @@ void IncompleteLUT::factorize(const _MatrixType& amat) sizel = len; len = (std::min)(sizel, nnzL); typename Vector::SegmentReturnType ul(u.segment(0, sizel)); - typename VectorXi::SegmentReturnType jul(ju.segment(0, sizel)); + typename VectorI::SegmentReturnType jul(ju.segment(0, sizel)); internal::QuickSplit(ul, jul, len); // store the largest m_fill elements of the L part @@ -427,14 +429,13 @@ void IncompleteLUT::factorize(const _MatrixType& amat) sizeu = len + 1; // +1 to take into account the diagonal element len = (std::min)(sizeu, nnzU); typename Vector::SegmentReturnType uu(u.segment(ii+1, sizeu-1)); - typename VectorXi::SegmentReturnType juu(ju.segment(ii+1, sizeu-1)); + typename VectorI::SegmentReturnType juu(ju.segment(ii+1, sizeu-1)); internal::QuickSplit(uu, juu, len); // store the largest elements of the U part for(Index k = ii + 1; k < ii + len; k++) m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k); } - m_lu.finalize(); m_lu.makeCompressed(); diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp index f327e2fac..6d76389ce 100644 --- a/test/bicgstab.cpp +++ b/test/bicgstab.cpp @@ -10,11 +10,11 @@ #include "sparse_solver.h" #include -template void test_bicgstab_T() +template void test_bicgstab_T() { - BiCGSTAB, DiagonalPreconditioner > bicgstab_colmajor_diag; - BiCGSTAB, IdentityPreconditioner > bicgstab_colmajor_I; - BiCGSTAB, IncompleteLUT > bicgstab_colmajor_ilut; + BiCGSTAB, DiagonalPreconditioner > bicgstab_colmajor_diag; + BiCGSTAB, IdentityPreconditioner > bicgstab_colmajor_I; + BiCGSTAB, IncompleteLUT > bicgstab_colmajor_ilut; //BiCGSTAB, SSORPreconditioner > bicgstab_colmajor_ssor; CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_diag) ); @@ -25,6 +25,7 @@ template void test_bicgstab_T() void test_bicgstab() { - CALL_SUBTEST_1(test_bicgstab_T()); - CALL_SUBTEST_2(test_bicgstab_T >()); + CALL_SUBTEST_1((test_bicgstab_T()) ); + CALL_SUBTEST_1((test_bicgstab_T())); + CALL_SUBTEST_2((test_bicgstab_T, int>())); } From 9e885fb76608c8d90da20151db6459a99be360d6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 14:33:15 +0100 Subject: [PATCH 44/83] Add unit tests for CG and sparse-LLT 
for long int as storage-index --- Eigen/src/OrderingMethods/Ordering.h | 4 ++-- Eigen/src/SparseCholesky/SimplicialCholesky.h | 5 ++-- test/bicgstab.cpp | 2 +- test/conjugate_gradient.cpp | 18 +++++++------- test/simplicial_cholesky.cpp | 24 ++++++++++--------- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index e88e637a4..cb838d04a 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -90,11 +90,11 @@ class AMDOrdering * \note Returns an empty permutation matrix * \tparam Index The type of indices of the matrix */ -template +template class NaturalOrdering { public: - typedef PermutationMatrix PermutationType; + typedef PermutationMatrix PermutationType; /** Compute the permutation vector from a column-major sparse matrix */ template diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index a0815e708..f56298e8c 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -69,6 +69,7 @@ class SimplicialCholeskyBase : public SparseSolverBase typedef SparseMatrix CholMatrixType; typedef CholMatrixType const * ConstCholMatrixPtr; typedef Matrix VectorType; + typedef Matrix VectorI; public: @@ -250,8 +251,8 @@ class SimplicialCholeskyBase : public SparseSolverBase CholMatrixType m_matrix; VectorType m_diag; // the diagonal coefficients (LDLT mode) - VectorXi m_parent; // elimination tree - VectorXi m_nonZerosPerCol; + VectorI m_parent; // elimination tree + VectorI m_nonZerosPerCol; PermutationMatrix m_P; // the permutation PermutationMatrix m_Pinv; // the inverse permutation diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp index 6d76389ce..7a9a11330 100644 --- a/test/bicgstab.cpp +++ b/test/bicgstab.cpp @@ -26,6 +26,6 @@ template void test_bicgstab_T() void test_bicgstab() { CALL_SUBTEST_1((test_bicgstab_T()) ); - CALL_SUBTEST_1((test_bicgstab_T())); CALL_SUBTEST_2((test_bicgstab_T, int>())); + CALL_SUBTEST_3((test_bicgstab_T())); } diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 019cc4d64..9622fd86d 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -10,13 +10,14 @@ #include "sparse_solver.h" #include -template void test_conjugate_gradient_T() +template void test_conjugate_gradient_T() { - ConjugateGradient, Lower > cg_colmajor_lower_diag; - ConjugateGradient, Upper > cg_colmajor_upper_diag; - ConjugateGradient, Lower|Upper> cg_colmajor_loup_diag; - ConjugateGradient, Lower, IdentityPreconditioner> cg_colmajor_lower_I; - ConjugateGradient, Upper, IdentityPreconditioner> cg_colmajor_upper_I; + typedef SparseMatrix SparseMatrixType; + ConjugateGradient cg_colmajor_lower_diag; + ConjugateGradient cg_colmajor_upper_diag; + ConjugateGradient cg_colmajor_loup_diag; + ConjugateGradient cg_colmajor_lower_I; + ConjugateGradient cg_colmajor_upper_I; CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag) ); @@ -27,6 +28,7 @@ template void test_conjugate_gradient_T() void test_conjugate_gradient() { - CALL_SUBTEST_1(test_conjugate_gradient_T()); - CALL_SUBTEST_2(test_conjugate_gradient_T >()); + CALL_SUBTEST_1(( test_conjugate_gradient_T() )); + CALL_SUBTEST_2(( test_conjugate_gradient_T, int>() )); + CALL_SUBTEST_3(( test_conjugate_gradient_T() )); } diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp index 
786468421..b7cc2d351 100644 --- a/test/simplicial_cholesky.cpp +++ b/test/simplicial_cholesky.cpp @@ -9,16 +9,17 @@ #include "sparse_solver.h" -template void test_simplicial_cholesky_T() +template void test_simplicial_cholesky_T() { - SimplicialCholesky, Lower> chol_colmajor_lower_amd; - SimplicialCholesky, Upper> chol_colmajor_upper_amd; - SimplicialLLT, Lower> llt_colmajor_lower_amd; - SimplicialLLT, Upper> llt_colmajor_upper_amd; - SimplicialLDLT, Lower> ldlt_colmajor_lower_amd; - SimplicialLDLT, Upper> ldlt_colmajor_upper_amd; - SimplicialLDLT, Lower, NaturalOrdering > ldlt_colmajor_lower_nat; - SimplicialLDLT, Upper, NaturalOrdering > ldlt_colmajor_upper_nat; + typedef SparseMatrix SparseMatrixType; + SimplicialCholesky chol_colmajor_lower_amd; + SimplicialCholesky chol_colmajor_upper_amd; + SimplicialLLT< SparseMatrixType, Lower> llt_colmajor_lower_amd; + SimplicialLLT< SparseMatrixType, Upper> llt_colmajor_upper_amd; + SimplicialLDLT< SparseMatrixType, Lower> ldlt_colmajor_lower_amd; + SimplicialLDLT< SparseMatrixType, Upper> ldlt_colmajor_upper_amd; + SimplicialLDLT< SparseMatrixType, Lower, NaturalOrdering > ldlt_colmajor_lower_nat; + SimplicialLDLT< SparseMatrixType, Upper, NaturalOrdering > ldlt_colmajor_upper_nat; check_sparse_spd_solving(chol_colmajor_lower_amd); check_sparse_spd_solving(chol_colmajor_upper_amd); @@ -40,6 +41,7 @@ template void test_simplicial_cholesky_T() void test_simplicial_cholesky() { - CALL_SUBTEST_1(test_simplicial_cholesky_T()); - CALL_SUBTEST_2(test_simplicial_cholesky_T >()); + CALL_SUBTEST_1(( test_simplicial_cholesky_T() )); + CALL_SUBTEST_2(( test_simplicial_cholesky_T, int>() )); + CALL_SUBTEST_3(( test_simplicial_cholesky_T() )); } From d4317a85e8266fd9f840986821a2ca1cf673262b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 21:29:46 +0100 Subject: [PATCH 45/83] Add typedefs for return types of SparseMatrixBase::selfadjointView --- Eigen/src/SparseCore/SparseMatrixBase.h | 10 +++++++--- Eigen/src/SparseCore/SparseSelfAdjointView.h | 12 ++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index d76dfa33d..55b0ad9d2 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -97,7 +97,6 @@ template class SparseMatrixBase : public EigenBase Transpose >::type AdjointReturnType; typedef Transpose TransposeReturnType; - template struct SelfAdjointViewReturnType { typedef SelfAdjointView Type; }; typedef typename internal::add_const >::type ConstTransposeReturnType; // FIXME storage order do not match evaluator storage order @@ -300,9 +299,14 @@ template class SparseMatrixBase : public EigenBase template inline const TriangularView triangularView() const; + + template struct SelfAdjointViewReturnType { typedef SparseSelfAdjointView Type; }; + template struct ConstSelfAdjointViewReturnType { typedef const SparseSelfAdjointView Type; }; - template inline const SparseSelfAdjointView selfadjointView() const; - template inline SparseSelfAdjointView selfadjointView(); + template inline + typename ConstSelfAdjointViewReturnType::Type selfadjointView() const; + template inline + typename SelfAdjointViewReturnType::Type selfadjointView(); template Scalar dot(const MatrixBase& other) const; template Scalar dot(const SparseMatrixBase& other) const; diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 6467d4894..3da856799 100644 --- 
a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -169,17 +169,17 @@ template class SparseSelfAdjointView ***************************************************************************/ template -template -const SparseSelfAdjointView SparseMatrixBase::selfadjointView() const +template +typename SparseMatrixBase::template ConstSelfAdjointViewReturnType::Type SparseMatrixBase::selfadjointView() const { - return SparseSelfAdjointView(derived()); + return SparseSelfAdjointView(derived()); } template -template -SparseSelfAdjointView SparseMatrixBase::selfadjointView() +template +typename SparseMatrixBase::template SelfAdjointViewReturnType::Type SparseMatrixBase::selfadjointView() { - return SparseSelfAdjointView(derived()); + return SparseSelfAdjointView(derived()); } /*************************************************************************** From fd788748889f50536f590b68dfa98db0044e5115 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Mar 2015 21:31:03 +0100 Subject: [PATCH 46/83] Fix compilation of iterative solvers with dense matrices --- .../ConjugateGradient.h | 2 +- .../IterativeSolverBase.h | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 11b8347f7..9e7dd1404 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -185,7 +185,7 @@ public: { typedef typename internal::conditional&, - SparseSelfAdjointView, UpLo> + typename Ref::template ConstSelfAdjointViewReturnType::Type >::type MatrixWrapperType; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 46bc0ac78..6477b9de2 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -52,9 +52,9 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - template - explicit IterativeSolverBase(const SparseMatrixBase& A) - : mp_matrix(A) + template + explicit IterativeSolverBase(const EigenBase& A) + : mp_matrix(A.derived()) { init(); compute(mp_matrix); @@ -67,8 +67,8 @@ public: * Currently, this function mostly calls analyzePattern on the preconditioner. In the future * we might, for instance, implement column reordering for faster matrix vector products. */ - template - Derived& analyzePattern(const SparseMatrixBase& A) + template + Derived& analyzePattern(const EigenBase& A) { grab(A.derived()); m_preconditioner.analyzePattern(mp_matrix); @@ -87,8 +87,8 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - template - Derived& factorize(const SparseMatrixBase& A) + template + Derived& factorize(const EigenBase& A) { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); grab(A.derived()); @@ -108,8 +108,8 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. 
*/ - template - Derived& compute(const SparseMatrixBase& A) + template + Derived& compute(const EigenBase& A) { grab(A.derived()); m_preconditioner.compute(mp_matrix); @@ -223,11 +223,11 @@ protected: m_tolerance = NumTraits::epsilon(); } - template - void grab(const SparseMatrixBase &A) + template + void grab(const EigenBase &A) { mp_matrix.~Ref(); - ::new (&mp_matrix) Ref(A); + ::new (&mp_matrix) Ref(A.derived()); } void grab(const Ref &A) From f2c3e2b10fbc15fbcd3d5a24def771cbd7549d8c Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 12 Mar 2015 13:16:33 -0700 Subject: [PATCH 47/83] Add --only-cubic-sizes option to analyze-blocking-sizes tool --- bench/analyze-blocking-sizes.cpp | 81 +++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index 7aae57d93..cf8236a82 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -25,6 +25,9 @@ using namespace std; const int default_precision = 4; +// see --only-cubic-sizes +bool only_cubic_sizes = false; + uint8_t log2_pot(size_t x) { size_t l = 0; while (x >>= 1) l++; @@ -130,6 +133,9 @@ struct inputfile_t cerr << "offending line:" << endl << line << endl; exit(1); } + if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) { + continue; + } inputfile_entry_t entry; entry.product_size = uint16_t(product_size); entry.pot_block_size = uint16_t(block_size); @@ -155,6 +161,9 @@ struct inputfile_t cerr << "offending line:" << endl << line << endl; exit(1); } + if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) { + continue; + } inputfile_entry_t entry; entry.product_size = uint16_t(product_size); entry.pot_block_size = 0; @@ -505,28 +514,23 @@ void print_partition( struct action_t { virtual const char* invokation_name() const { abort(); return nullptr; } - virtual void run(int, char*[]) const { abort(); } + virtual void run(const vector&) const { abort(); } virtual ~action_t() {} }; struct partition_action_t : action_t { - virtual const char* invokation_name() const { return "partition"; } - virtual void run(int argc, char *argv[]) const + virtual const char* invokation_name() const override { return "partition"; } + virtual void run(const vector& input_filenames) const override { vector preprocessed_inputfiles; - if (!argc) { + if (input_filenames.empty()) { cerr << "The " << invokation_name() << " action needs a list of input files." << endl; exit(1); } - vector inputfilenames; - for (int i = 0; i < argc; i++) { - inputfilenames.emplace_back(argv[i]); - } - - for (auto it = inputfilenames.begin(); it != inputfilenames.end(); ++it) { + for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) { inputfile_t inputfile(*it); switch (inputfile.type) { case inputfile_t::type_t::all_pot_sizes: @@ -610,7 +614,7 @@ struct evaluate_defaults_action_t : action_t static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) { return e1.default_efficiency < e2.default_efficiency; } - virtual const char* invokation_name() const { return "evaluate-defaults"; } + virtual const char* invokation_name() const override { return "evaluate-defaults"; } void show_usage_and_exit() const { cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl; @@ -618,13 +622,13 @@ struct evaluate_defaults_action_t : action_t << "performance measured over all POT sizes." 
<< endl; exit(1); } - virtual void run(int argc, char *argv[]) const + virtual void run(const vector& input_filenames) const override { - if (argc != 2) { + if (input_filenames.size() != 2) { show_usage_and_exit(); } - inputfile_t inputfile_default_sizes(argv[0]); - inputfile_t inputfile_all_pot_sizes(argv[1]); + inputfile_t inputfile_default_sizes(input_filenames[0]); + inputfile_t inputfile_all_pot_sizes(input_filenames[1]); if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) { cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl; show_usage_and_exit(); @@ -719,7 +723,7 @@ struct evaluate_defaults_action_t : action_t void show_usage_and_exit(int argc, char* argv[], const vector>& available_actions) { - cerr << "usage: " << argv[0] << " " << endl; + cerr << "usage: " << argv[0] << " [options...] " << endl; cerr << "available actions:" << endl; for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { cerr << " " << (*it)->invokation_name() << endl; @@ -737,21 +741,52 @@ int main(int argc, char* argv[]) available_actions.emplace_back(new partition_action_t); available_actions.emplace_back(new evaluate_defaults_action_t); - auto action = available_actions.end(); + vector input_filenames; + + action_t* action = nullptr; if (argc < 2) { show_usage_and_exit(argc, argv, available_actions); } - for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { - if (!strcmp(argv[1], (*it)->invokation_name())) { - action = it; - break; + for (int i = 1; i < argc; i++) { + bool arg_handled = false; + // Step 1. Try to match action invokation names. + for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { + if (!strcmp(argv[i], (*it)->invokation_name())) { + if (!action) { + action = it->get(); + arg_handled = true; + break; + } else { + cerr << "can't specify more than one action!" << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } } + if (arg_handled) { + continue; + } + // Step 2. Try to match option names. + if (argv[i][0] == '-') { + if (!strcmp(argv[i], "--only-cubic-sizes")) { + only_cubic_sizes = true; + arg_handled = true; + } + if (!arg_handled) { + cerr << "Unrecognized option: " << argv[i] << endl; + show_usage_and_exit(argc, argv, available_actions); + } + } + if (arg_handled) { + continue; + } + // Step 3. Default to interpreting args as input filenames. + input_filenames.emplace_back(argv[i]); } - if (action == available_actions.end()) { + if (!action) { show_usage_and_exit(argc, argv, available_actions); } - (*action)->run(argc - 2, argv + 2); + action->run(input_filenames); } From 2f6f8bf31c781347e48183d3681978eff6ddbede Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 16:24:40 +0100 Subject: [PATCH 48/83] Add missing coeff/coeffRef members to Block, and extend unit tests. --- Eigen/src/SparseCore/SparseBlock.h | 25 ++++++++++ test/sparse_basic.cpp | 78 +++++++++++++++++++++++------- 2 files changed, 86 insertions(+), 17 deletions(-) diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index acd82e926..2b31716a3 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -49,6 +49,16 @@ public: return nnz; } + inline const Scalar coeff(Index row, Index col) const + { + return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 
0 : m_outerStart)); + } + + inline const Scalar coeff(Index index) const + { + return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index : m_outerStart); + } + inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; } Index startRow() const { return IsRowMajor ? m_outerStart : 0; } Index startCol() const { return IsRowMajor ? 0 : m_outerStart; } @@ -204,6 +214,21 @@ public: } bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; } + + inline Scalar& coeffRef(Index row, Index col) + { + return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart)); + } + + inline const Scalar coeff(Index row, Index col) const + { + return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart)); + } + + inline const Scalar coeff(Index index) const + { + return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index : m_outerStart); + } const Scalar& lastCoeff() const { diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index e243964f4..d929e1463 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -30,6 +30,7 @@ template void sparse_basic(const SparseMatrixType& re double density = (std::max)(8./(rows*cols), 0.01); typedef Matrix DenseMatrix; typedef Matrix DenseVector; + typedef Matrix RowDenseVector; Scalar eps = 1e-6; Scalar s1 = internal::random(); @@ -59,32 +60,61 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m, refMat); // test InnerIterators and Block expressions - for (Index t=0; t<10; ++t) + for (int t=0; t<10; ++t) { - Index j = internal::random(0,cols-1); - Index i = internal::random(0,rows-1); - Index w = internal::random(1,cols-j-1); - Index h = internal::random(1,rows-i-1); + Index j = internal::random(0,cols-2); + Index i = internal::random(0,rows-2); + Index w = internal::random(1,cols-j); + Index h = internal::random(1,rows-i); - VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w)); + VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w)); for(Index c=0; c void sparse_basic(const SparseMatrixType& re refMat2.col(i) = refMat2.col(i) * s1; VERIFY_IS_APPROX(m2,refMat2); } + + Index r0 = internal::random(0,rows-2); + Index c0 = internal::random(0,cols-2); + Index r1 = internal::random(1,rows-r0); + Index c1 = internal::random(1,cols-c0); + + VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0)); + VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0)); + + VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0)); + VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0)); + + VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1)); + VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1)); } // test prune From d73ccd717e000d0a91293db2b24c402e49c907ff Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 13 Mar 2015 10:36:01 -0700 Subject: [PATCH 49/83] Add support for dumping blocking sizes tables --- bench/analyze-blocking-sizes.cpp | 75 +++++++++++++++++++++++++++++- bench/benchmark-blocking-sizes.cpp | 4 +- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index cf8236a82..c133df599 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -28,6 +28,9 @@ const int default_precision = 4; // see --only-cubic-sizes bool only_cubic_sizes = false; +// see --dump-tables +bool dump_tables = false; + uint8_t log2_pot(size_t x) { size_t l = 0; while (x 
>>= 1) l++; @@ -318,14 +321,74 @@ float efficiency_of_subset( efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry); } efficiency = min(efficiency, efficiency_this_product_size); - first_entry_index_with_this_product_size = entry_index; - product_size = first_file.entries[entry_index].product_size; + if (entry_index < num_entries) { + first_entry_index_with_this_product_size = entry_index; + product_size = first_file.entries[entry_index].product_size; + } } } return efficiency; } +void dump_table_for_subset( + const vector& preprocessed_inputfiles, + const vector& subset) +{ + const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]]; + const size_t num_entries = first_file.entries.size(); + size_t entry_index = 0; + size_t first_entry_index_with_this_product_size = 0; + uint16_t product_size = first_file.entries[0].product_size; + size_t i = 0; + size_triple_t min_product_size(first_file.entries.front().product_size); + size_triple_t max_product_size(first_file.entries.back().product_size); + if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) { + abort(); + } + if (only_cubic_sizes) { + cout << "/* Warning: generated with --only-cubic-sizes ! */" << endl; + } + cout << "struct optimal_block_sizes_table {" << endl; + cout << " static const size_t min_size = " << min_product_size.k << ";" << endl; + cout << " static const size_t max_size = " << max_product_size.k << ";" << endl; + cout << " static const uint16_t* table() {" << endl; + cout << " static const uint16_t data[] = {"; + while (entry_index < num_entries) { + ++entry_index; + if (entry_index == num_entries || + first_file.entries[entry_index].product_size != product_size) + { + float best_efficiency_this_product_size = 0.0f; + uint16_t best_block_size_this_product_size = 0; + for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) { + float efficiency_this_entry = 1.0f; + for (auto i = subset.begin(); i != subset.end(); ++i) { + efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency); + } + if (efficiency_this_entry > best_efficiency_this_product_size) { + best_efficiency_this_product_size = efficiency_this_entry; + best_block_size_this_product_size = first_file.entries[e].block_size; + } + } + if ((i++) % 8) { + cout << ", "; + } else { + cout << endl << " "; + } + cout << "0x" << hex << best_block_size_this_product_size << dec; + if (entry_index < num_entries) { + first_entry_index_with_this_product_size = entry_index; + product_size = first_file.entries[entry_index].product_size; + } + } + } + cout << endl << " };" << endl; + cout << " return data;" << endl; + cout << " }" << endl; + cout << "};" << endl; +} + float efficiency_of_partition( const vector& preprocessed_inputfiles, const vector>& partition) @@ -507,6 +570,10 @@ void print_partition( for (auto file = subset->begin(); file != subset->end(); ++file) { cout << " " << preprocessed_inputfiles[*file].filename << endl; } + if (dump_tables) { + cout << " Table:" << endl; + dump_table_for_subset(preprocessed_inputfiles, *subset); + } } cout << endl; } @@ -772,6 +839,10 @@ int main(int argc, char* argv[]) only_cubic_sizes = true; arg_handled = true; } + if (!strcmp(argv[i], "--dump-tables")) { + dump_tables = true; + arg_handled = true; + } if (!arg_handled) { cerr << "Unrecognized option: " << argv[i] << endl; show_usage_and_exit(argc, argv, available_actions); diff --git a/bench/benchmark-blocking-sizes.cpp 
b/bench/benchmark-blocking-sizes.cpp index 33d3824eb..827be2880 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -446,7 +446,7 @@ void try_run_some_benchmarks( unsigned int seconds_to_sleep_if_lower_clock_speed = 1; while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { - if (seconds_to_sleep_if_lower_clock_speed > 30) { + if (seconds_to_sleep_if_lower_clock_speed > 32) { cerr << "Sleeping longer probably won't make a difference." << endl; cerr << "Serializing benchmarks to " << session_filename << endl; serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run); @@ -456,7 +456,7 @@ void try_run_some_benchmarks( rerun_last_tests = true; cerr << "Sleeping " << seconds_to_sleep_if_lower_clock_speed - << " s..." << endl; + << " s... \r" << endl; sleep(seconds_to_sleep_if_lower_clock_speed); current_clock_speed = measure_clock_speed(); seconds_to_sleep_if_lower_clock_speed *= 2; From 5ffe29cb9f260f35f1ae0f878885a77d42a29d8b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 20:57:33 +0100 Subject: [PATCH 50/83] Bound pre-allocation to the maximal size representable by StorageIndex and throw bad_alloc if that's not possible. --- Eigen/src/SparseCore/CompressedStorage.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index 49fd46658..52c7da297 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -86,7 +86,12 @@ class CompressedStorage void resize(Index size, double reserveSizeFactor = 0) { if (m_allocatedSize)(NumTraits::highest(), size + Index(reserveSizeFactor*double(size))); + if(realloc_size Date: Fri, 13 Mar 2015 21:00:21 +0100 Subject: [PATCH 51/83] SparseMatrix::insert: switch to a fully uncompressed mode if sequential insertion is not possible (otherwise an arbitrary large amount of memory was preallocated in some cases) --- Eigen/src/SparseCore/SparseMatrix.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 0ba7e111a..4c3a47959 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -1172,8 +1172,12 @@ typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Op return (m_data.value(p) = 0); } - // make sure the matrix is compatible to random un-compressed insertion: - m_data.resize(m_data.allocatedSize()); + if(m_data.size() != m_data.allocatedSize()) + { + // make sure the matrix is compatible to random un-compressed insertion: + m_data.resize(m_data.allocatedSize()); + this->reserveInnerVectors(Array::Constant(2*m_outerSize, convert_index(m_outerSize))); + } return insertUncompressed(row,col); } From 8580eb6808428a53d5fb91be23fb5c6c8c9e9463 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 21:06:20 +0100 Subject: [PATCH 52/83] bug #949: add static assertion for incompatible scalar types in dense end-user decompositions. 
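The mechanism in the diff below is a protected check_template_parameters() hook that the dense decompositions touched here call from compute() (or from the constructor, in SVDBase's case) before doing any real work; it expands EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar), plus an additional non-complex check for the real-valued eigensolvers, so instantiating one of these solvers on an integer scalar now fails to build, which is exactly what the new failtests verify. As a hedged illustration of that pattern only (ToyDecomposition is a made-up stand-in using C++11 static_assert, not Eigen's macro-based implementation):

    #include <type_traits>

    // Toy stand-in for a matrix factorization: it rejects integer scalars at
    // compile time, mirroring the check_template_parameters() hook in the diff below.
    template <typename Scalar>
    struct ToyDecomposition {
      static void check_template_parameters() {
        static_assert(!std::is_integral<Scalar>::value,
                      "this decomposition requires a non-integer scalar type");
      }
      ToyDecomposition& compute() {
        check_template_parameters();  // fires only when compute() is instantiated
        // ... actual factorization work would go here ...
        return *this;
      }
    };

    int main() {
      ToyDecomposition<float> ok;
      ok.compute();                   // builds and runs fine
      // ToyDecomposition<int> bad;
      // bad.compute();               // would fail to compile, like the failtests below
      return 0;
    }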
--- Eigen/src/Cholesky/LDLT.h | 7 +++++++ Eigen/src/Cholesky/LLT.h | 8 ++++++++ Eigen/src/Eigenvalues/ComplexEigenSolver.h | 8 ++++++++ Eigen/src/Eigenvalues/EigenSolver.h | 9 +++++++++ Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 9 +++++++++ Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 7 +++++++ Eigen/src/LU/FullPivLU.h | 8 ++++++++ Eigen/src/LU/PartialPivLU.h | 8 ++++++++ Eigen/src/QR/ColPivHouseholderQR.h | 8 ++++++++ Eigen/src/QR/FullPivHouseholderQR.h | 8 ++++++++ Eigen/src/QR/HouseholderQR.h | 8 ++++++++ Eigen/src/SVD/SVDBase.h | 10 +++++++++- failtest/CMakeLists.txt | 12 ++++++++++++ failtest/bdcsvd_int.cpp | 14 ++++++++++++++ failtest/colpivqr_int.cpp | 14 ++++++++++++++ failtest/eigensolver_cplx.cpp | 14 ++++++++++++++ failtest/eigensolver_int.cpp | 14 ++++++++++++++ failtest/fullpivlu_int.cpp | 14 ++++++++++++++ failtest/fullpivqr_int.cpp | 14 ++++++++++++++ failtest/jacobisvd_int.cpp | 14 ++++++++++++++ failtest/ldlt_int.cpp | 14 ++++++++++++++ failtest/llt_int.cpp | 14 ++++++++++++++ failtest/partialpivlu_int.cpp | 14 ++++++++++++++ failtest/qr_int.cpp | 14 ++++++++++++++ 24 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 failtest/bdcsvd_int.cpp create mode 100644 failtest/colpivqr_int.cpp create mode 100644 failtest/eigensolver_cplx.cpp create mode 100644 failtest/eigensolver_int.cpp create mode 100644 failtest/fullpivlu_int.cpp create mode 100644 failtest/fullpivqr_int.cpp create mode 100644 failtest/jacobisvd_int.cpp create mode 100644 failtest/ldlt_int.cpp create mode 100644 failtest/llt_int.cpp create mode 100644 failtest/partialpivlu_int.cpp create mode 100644 failtest/qr_int.cpp diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index f46f7b758..93a726483 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -226,6 +226,11 @@ template class LDLT #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } /** \internal * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U. @@ -424,6 +429,8 @@ template struct LDLT_Traits template LDLT& LDLT::compute(const MatrixType& a) { + check_template_parameters(); + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 629c87161..745b74d95 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -170,6 +170,12 @@ template class LLT #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + /** \internal * Used to compute and store L * The strict upper part is not used and even not initialized. 
@@ -377,6 +383,8 @@ template struct LLT_Traits template LLT& LLT::compute(const MatrixType& a) { + check_template_parameters(); + eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h index 075a62848..6b010c312 100644 --- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -234,6 +234,12 @@ template class ComplexEigenSolver } protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + EigenvectorType m_eivec; EigenvalueType m_eivalues; ComplexSchur m_schur; @@ -251,6 +257,8 @@ template ComplexEigenSolver& ComplexEigenSolver::compute(const MatrixType& matrix, bool computeEigenvectors) { + check_template_parameters(); + // this code is inspired from Jampack eigen_assert(matrix.cols() == matrix.rows()); diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index a63a42341..167cd99ab 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -299,6 +299,13 @@ template class EigenSolver void doComputeEigenvectors(); protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + EIGEN_STATIC_ASSERT(!NumTraits::IsComplex, NUMERIC_TYPE_MUST_BE_REAL); + } + MatrixType m_eivec; EigenvalueType m_eivalues; bool m_isInitialized; @@ -366,6 +373,8 @@ template EigenSolver& EigenSolver::compute(const MatrixType& matrix, bool computeEigenvectors) { + check_template_parameters(); + using std::sqrt; using std::abs; using numext::isfinite; diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index c9da6740a..e2e28cd4a 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -263,6 +263,13 @@ template class GeneralizedEigenSolver } protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + EIGEN_STATIC_ASSERT(!NumTraits::IsComplex, NUMERIC_TYPE_MUST_BE_REAL); + } + MatrixType m_eivec; ComplexVectorType m_alphas; VectorType m_betas; @@ -290,6 +297,8 @@ template GeneralizedEigenSolver& GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors) { + check_template_parameters(); + using std::sqrt; using std::abs; eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows()); diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 66d1154cf..1dcfacf0b 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -347,6 +347,11 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; @@ -382,6 +387,8 @@ EIGEN_DEVICE_FUNC SelfAdjointEigenSolver& SelfAdjointEigenSolver ::compute(const MatrixType& matrix, int options) { + check_template_parameters(); + using std::abs; eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index d1a260a37..75dbc16b0 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ 
-390,6 +390,12 @@ template class FullPivLU #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_lu; PermutationPType m_p; PermutationQType m_q; @@ -434,6 +440,8 @@ FullPivLU::FullPivLU(const MatrixType& matrix) template FullPivLU& FullPivLU::compute(const MatrixType& matrix) { + check_template_parameters(); + // the permutations are stored as int indices, so just to be sure: eigen_assert(matrix.rows()<=NumTraits::highest() && matrix.cols()<=NumTraits::highest()); diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 3d8825a97..019fc4230 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -209,6 +209,12 @@ template class PartialPivLU #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_lu; PermutationType m_p; TranspositionType m_rowsTranspositions; @@ -425,6 +431,8 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t template PartialPivLU& PartialPivLU::compute(const MatrixType& matrix) { + check_template_parameters(); + // the row permutation is stored as int indices, so just to be sure: eigen_assert(matrix.rows()::highest()); diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 03ff0a8f2..7b3842cbe 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -398,6 +398,12 @@ template class ColPivHouseholderQR #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_qr; HCoeffsType m_hCoeffs; PermutationType m_colsPermutation; @@ -436,6 +442,8 @@ typename MatrixType::RealScalar ColPivHouseholderQR::logAbsDetermina template ColPivHouseholderQR& ColPivHouseholderQR::compute(const MatrixType& matrix) { + check_template_parameters(); + using std::abs; Index rows = matrix.rows(); Index cols = matrix.cols(); diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index 4952fbb46..4c2c958a8 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -380,6 +380,12 @@ template class FullPivHouseholderQR #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; @@ -419,6 +425,8 @@ typename MatrixType::RealScalar FullPivHouseholderQR::logAbsDetermin template FullPivHouseholderQR& FullPivHouseholderQR::compute(const MatrixType& matrix) { + check_template_parameters(); + using std::abs; Index rows = matrix.rows(); Index cols = matrix.cols(); diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 195bacb85..878654be5 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -196,6 +196,12 @@ template class HouseholderQR #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; @@ -348,6 +354,8 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c template HouseholderQR& HouseholderQR::compute(const MatrixType& matrix) { + check_template_parameters(); + Index rows = matrix.rows(); Index cols = matrix.cols(); Index size = (std::min)(rows,cols); diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index 8903755e7..b89393721 100644 --- 
a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -217,6 +217,12 @@ public: #endif protected: + + static void check_template_parameters() + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); + } + // return true if already allocated bool allocate(Index rows, Index cols, unsigned int computationOptions) ; @@ -240,7 +246,9 @@ protected: m_usePrescribedThreshold(false), m_computationOptions(0), m_rows(-1), m_cols(-1), m_diagSize(0) - {} + { + check_template_parameters(); + } }; diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt index c8795a344..d3e82ecd9 100644 --- a/failtest/CMakeLists.txt +++ b/failtest/CMakeLists.txt @@ -47,6 +47,18 @@ ei_add_failtest("sparse_ref_3") ei_add_failtest("sparse_ref_4") ei_add_failtest("sparse_ref_5") +ei_add_failtest("partialpivlu_int") +ei_add_failtest("fullpivlu_int") +ei_add_failtest("llt_int") +ei_add_failtest("ldlt_int") +ei_add_failtest("qr_int") +ei_add_failtest("colpivqr_int") +ei_add_failtest("fullpivqr_int") +ei_add_failtest("jacobisvd_int") +ei_add_failtest("bdcsvd_int") +ei_add_failtest("eigensolver_int") +ei_add_failtest("eigensolver_cplx") + if (EIGEN_FAILTEST_FAILURE_COUNT) message(FATAL_ERROR "${EIGEN_FAILTEST_FAILURE_COUNT} out of ${EIGEN_FAILTEST_COUNT} failtests FAILED. " diff --git a/failtest/bdcsvd_int.cpp b/failtest/bdcsvd_int.cpp new file mode 100644 index 000000000..670752cf5 --- /dev/null +++ b/failtest/bdcsvd_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/SVD" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + BDCSVD > qr(Matrix::Random(10,10)); +} diff --git a/failtest/colpivqr_int.cpp b/failtest/colpivqr_int.cpp new file mode 100644 index 000000000..db11910d4 --- /dev/null +++ b/failtest/colpivqr_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/QR" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + ColPivHouseholderQR > qr(Matrix::Random(10,10)); +} diff --git a/failtest/eigensolver_cplx.cpp b/failtest/eigensolver_cplx.cpp new file mode 100644 index 000000000..c2e21e189 --- /dev/null +++ b/failtest/eigensolver_cplx.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Eigenvalues" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR std::complex +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + EigenSolver > eig(Matrix::Random(10,10)); +} diff --git a/failtest/eigensolver_int.cpp b/failtest/eigensolver_int.cpp new file mode 100644 index 000000000..eda8dc20b --- /dev/null +++ b/failtest/eigensolver_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Eigenvalues" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + EigenSolver > eig(Matrix::Random(10,10)); +} diff --git a/failtest/fullpivlu_int.cpp b/failtest/fullpivlu_int.cpp new file mode 100644 index 000000000..e9d2c6eb3 --- /dev/null +++ b/failtest/fullpivlu_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/LU" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + FullPivLU > lu(Matrix::Random(10,10)); +} diff --git a/failtest/fullpivqr_int.cpp b/failtest/fullpivqr_int.cpp new file mode 100644 index 000000000..d182a7b6b --- /dev/null +++ b/failtest/fullpivqr_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/QR" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; 
+ +int main() +{ + FullPivHouseholderQR > qr(Matrix::Random(10,10)); +} diff --git a/failtest/jacobisvd_int.cpp b/failtest/jacobisvd_int.cpp new file mode 100644 index 000000000..12790aef1 --- /dev/null +++ b/failtest/jacobisvd_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/SVD" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + JacobiSVD > qr(Matrix::Random(10,10)); +} diff --git a/failtest/ldlt_int.cpp b/failtest/ldlt_int.cpp new file mode 100644 index 000000000..243e45746 --- /dev/null +++ b/failtest/ldlt_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Cholesky" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + LDLT > ldlt(Matrix::Random(10,10)); +} diff --git a/failtest/llt_int.cpp b/failtest/llt_int.cpp new file mode 100644 index 000000000..cb020650d --- /dev/null +++ b/failtest/llt_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/Cholesky" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + LLT > llt(Matrix::Random(10,10)); +} diff --git a/failtest/partialpivlu_int.cpp b/failtest/partialpivlu_int.cpp new file mode 100644 index 000000000..98ef282ea --- /dev/null +++ b/failtest/partialpivlu_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/LU" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + PartialPivLU > lu(Matrix::Random(10,10)); +} diff --git a/failtest/qr_int.cpp b/failtest/qr_int.cpp new file mode 100644 index 000000000..ce200e818 --- /dev/null +++ b/failtest/qr_int.cpp @@ -0,0 +1,14 @@ +#include "../Eigen/QR" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define SCALAR int +#else +#define SCALAR float +#endif + +using namespace Eigen; + +int main() +{ + HouseholderQR > qr(Matrix::Random(10,10)); +} From d99ab35f9e886a014be6d47606d232af1e668f76 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 21:12:46 +0100 Subject: [PATCH 53/83] Fix internal::random(x,y) for integer types. The previous implementation could return y+1. The new implementation uses rejection sampling to get an unbiased behavior.
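The diff that follows implements this with plain rejection sampling: std::rand() is split into equal-width buckets of width RAND_MAX/(range+1), the bucket index becomes the offset into [x, y], and any draw that lands past the top of the range is simply re-drawn, whereas the old formula scaled std::rand() into the interval directly and could yield y+1. A hedged, self-contained sketch of the same idea specialized to int (the patch itself is templated on the scalar type and keeps a separate branch for ranges wider than RAND_MAX), assuming 0 <= y - x < RAND_MAX:

    #include <cstdlib>
    #include <iostream>

    // Unbiased integer draw from [x, y] via rejection sampling.
    // Each accepted offset corresponds to exactly 'divisor' values of rand(),
    // so no value in [x, y] is favored, and y + 1 can never be returned.
    int random_in_range(int x, int y) {
      int range   = y - x;                  // the interval holds range + 1 values
      int divisor = RAND_MAX / (range + 1); // bucket width, >= 1 by assumption
      int offset;
      do {
        offset = std::rand() / divisor;     // bucket index of this draw
      } while (offset > range);             // reject the rare overshoot and retry
      return x + offset;
    }

    int main() {
      std::srand(42);
      for (int i = 0; i < 10; ++i)
        std::cout << random_in_range(3, 7) << ' ';  // always prints values in [3, 7]
      std::cout << '\n';
      return 0;
    }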
--- Eigen/src/Core/MathFunctions.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 878f38e92..3c76a58b9 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -525,8 +525,25 @@ struct random_default_impl typedef typename NumTraits::NonInteger NonInteger; static inline Scalar run(const Scalar& x, const Scalar& y) - { - return x + Scalar((NonInteger(y)-x+1) * std::rand() / (RAND_MAX + NonInteger(1))); + { + using std::max; + Scalar range = (max)(Scalar(0),Scalar(y-x)); + Scalar offset = 0; + if(range<=RAND_MAX) + { + // rejection sampling + int divisor = RAND_MAX/(range+1); + + do { + offset = Scalar(std::rand() / divisor); + } while (offset > range); + } + else + { + offset = std::rand() * range; + } + + return x + offset; } static inline Scalar run() From 1330f8bbd12306cc4955d943f27e5281d413bed4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Mar 2015 21:15:50 +0100 Subject: [PATCH 54/83] bug #973, improve AVX support by enabling vectorization of Vector4i-like types, and enforcing alignement of Vector4f/Vector2d-like types to preserve compatibility with SSE and future Eigen versions that will vectorize them with AVX enabled. --- Eigen/src/Core/CoreEvaluators.h | 16 ++++-- Eigen/src/Core/DenseStorage.h | 88 +++++++++++++++++++++++++++++++-- Eigen/src/Core/util/Macros.h | 3 ++ Eigen/src/Core/util/XprHelper.h | 5 +- test/unalignedassert.cpp | 7 +-- test/vectorization_logic.cpp | 2 +- 6 files changed, 108 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 9485080d3..85f46cb8d 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -647,11 +647,15 @@ struct evaluator > HasNoStride = HasNoInnerStride && HasNoOuterStride, IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned), IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic, + + // TODO: should check for smaller packet types once we can handle multi-sized packet types + AlignBytes = int(packet_traits::size) * sizeof(Scalar), + KeepsPacketAccess = bool(HasNoInnerStride) && ( bool(IsDynamicSize) || HasNoOuterStride || ( OuterStrideAtCompileTime!=Dynamic - && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ), + && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime) % AlignBytes)==0 ) ), Flags0 = evaluator::Flags, Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit), Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime)) @@ -717,7 +721,10 @@ struct evaluator > && (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, - MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0, + // TODO: should check for smaller packet types once we can handle multi-sized packet types + AlignBytes = int(packet_traits::size) * sizeof(Scalar), + + MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % AlignBytes) == 0)) ? AlignedBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? 
LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, Flags0 = evaluator::Flags & ( (HereditaryBits & ~RowMajorBit) | @@ -825,12 +832,15 @@ struct block_evaluator::PlainObject> { typedef Block XprType; + typedef typename XprType::Scalar Scalar; EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) : mapbase_evaluator(block) { + // TODO: should check for smaller packet types once we can handle multi-sized packet types + const int AlignBytes = int(packet_traits::size) * sizeof(Scalar); // FIXME this should be an internal assertion - eigen_assert(EIGEN_IMPLIES(evaluator::Flags&AlignedBit, (size_t(block.data()) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); + eigen_assert(EIGEN_IMPLIES(evaluator::Flags&AlignedBit, (size_t(block.data()) % AlignBytes) == 0) && "data is not aligned"); } }; diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 9186f59a7..522aaa299 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -34,14 +34,35 @@ void check_static_allocation_size() #endif } +template::type, + bool Match = bool((Size%unpacket_traits::size)==0), + bool TryHalf = bool(unpacket_traits::size > Size) + && bool(unpacket_traits::size > unpacket_traits::half>::size) > +struct compute_default_alignment +{ + enum { value = 0 }; +}; + +template +struct compute_default_alignment // Match +{ + enum { value = sizeof(T) * unpacket_traits::size }; +}; + +template +struct compute_default_alignment +{ + // current packet too large, try with an half-packet + enum { value = compute_default_alignment::half>::value }; +}; + /** \internal * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned: * to 16 bytes boundary if the total size is a multiple of 16 bytes. */ template + : compute_default_alignment::value > struct plain_array { T array[Size]; @@ -81,14 +102,71 @@ struct plain_array #endif template -struct plain_array +struct plain_array { - EIGEN_USER_ALIGN_DEFAULT T array[Size]; + EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size]; EIGEN_DEVICE_FUNC plain_array() { - EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1); + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63); check_static_allocation_size(); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index aaea9f035..6b294e77f 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -318,6 +318,9 @@ // Defined the boundary (in bytes) on which the data needs to be aligned. 
Note // that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be // aligned at all regardless of the value of this #define. +// TODO should be renamed EIGEN_MAXIMAL_ALIGN_BYTES, +// for instance with AVX 1 EIGEN_MAXIMAL_ALIGN_BYTES=32 while for 'int' 16 bytes alignment is always enough, +// and 16 bytes alignment is also enough for Vector4f. #define EIGEN_ALIGN_BYTES 16 #ifdef EIGEN_DONT_ALIGN diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 528ebe297..562f425bd 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -159,13 +159,16 @@ class compute_matrix_evaluator_flags enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0, is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, + + // TODO: should check for smaller packet types once we can handle multi-sized packet types + align_bytes = int(packet_traits::size) * sizeof(Scalar), aligned_bit = ( ((Options&DontAlign)==0) && ( #if EIGEN_ALIGN_STATICALLY - ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) + ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % align_bytes) == 0)) #else 0 #endif diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index d8815263a..6f7b72167 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -81,7 +81,7 @@ void construct_at_boundary(int boundary) void unalignedassert() { - #if EIGEN_ALIGN_STATICALLY +#if EIGEN_ALIGN_STATICALLY construct_at_boundary(4); construct_at_boundary(4); construct_at_boundary(16); @@ -100,7 +100,7 @@ void unalignedassert() construct_at_boundary(4); construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); - #endif +#endif check_unalignedassert_good(); check_unalignedassert_good(); @@ -112,11 +112,12 @@ void unalignedassert() check_unalignedassert_good >(); #if EIGEN_ALIGN_STATICALLY - if(EIGEN_ALIGN_BYTES==16) + if(EIGEN_ALIGN_BYTES>=16) { VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); } for(int b=8; b::Vectori >(DefaultTraversal,CompleteUnrolling))); VERIFY((test_assign(Matrix11(), Matrix()*Matrix(), - PacketSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD?DefaultTraversal:InnerVectorizedTraversal, CompleteUnrolling))); + InnerVectorizedTraversal, CompleteUnrolling))); #endif VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3), From 488c15615a31fcee310200d726d8e58bb005bc87 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 13 Mar 2015 14:51:26 -0700 Subject: [PATCH 55/83] organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) --- .../Core/products/GeneralBlockPanelKernel.h | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 408281c82..1b47f1a6d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -25,21 +25,31 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff return a<=0 ? 
b : a; } +#if EIGEN_ARCH_i386_OR_x86_64 +const std::ptrdiff_t defaultL1CacheSize = 32*1024; +const std::ptrdiff_t defaultL2CacheSize = 256*1024; +const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024; +#else +const std::ptrdiff_t defaultL1CacheSize = 16*1024; +const std::ptrdiff_t defaultL2CacheSize = 512*1024; +const std::ptrdiff_t defaultL3CacheSize = 512*1024; +#endif + /** \internal */ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { static bool m_cache_sizes_initialized = false; - static std::ptrdiff_t m_l1CacheSize = 32*1024; - static std::ptrdiff_t m_l2CacheSize = 256*1024; - static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + static std::ptrdiff_t m_l1CacheSize = 0; + static std::ptrdiff_t m_l2CacheSize = 0; + static std::ptrdiff_t m_l3CacheSize = 0; if(!m_cache_sizes_initialized) { int l1CacheSize, l2CacheSize, l3CacheSize; queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); - m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); - m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); - m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize); m_cache_sizes_initialized = true; } @@ -974,7 +984,7 @@ void gebp_kernel(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); #else @@ -1211,7 +1221,7 @@ void gebp_kernel=2*Traits::LhsProgress) { - const Index l1 = 32*1024; // in Bytes, TODO, l1 should be passed to this function. + const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES Index actual_panel_rows = (2*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); #else From b6b88c08082dcfc5dd81c6997d6090507267cc13 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 13 Mar 2015 14:57:05 -0700 Subject: [PATCH 56/83] update perf_monitoring/gemm/changesets.txt --- bench/perf_monitoring/gemm/changesets.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index a5b63bc89..40a71c781 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -41,3 +41,5 @@ before-evaluators 6981:7e5d6f78da59 # dynamic loop swapping 6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache 6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. 
+7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) + From e56aabf205a1e8f581dd8a46d7d46ce79c45e158 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 18:05:12 -0400 Subject: [PATCH 57/83] Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables --- Eigen/Core | 1 + Eigen/src/Core/MathFunctions.h | 36 +++---- .../Core/products/GeneralBlockPanelKernel.h | 100 ++++++++++++------ .../Core/products/LookupBlockingSizesTable.h | 89 ++++++++++++++++ Eigen/src/Core/util/ForwardDeclarations.h | 8 ++ 5 files changed, 182 insertions(+), 52 deletions(-) create mode 100644 Eigen/src/Core/products/LookupBlockingSizesTable.h diff --git a/Eigen/Core b/Eigen/Core index 0b8eaa61c..138c34916 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -381,6 +381,7 @@ using std::ptrdiff_t; #include "src/Core/Inverse.h" #include "src/Core/TriangularMatrix.h" #include "src/Core/SelfAdjointView.h" +#include "src/Core/products/LookupBlockingSizesTable.h" #include "src/Core/products/GeneralBlockPanelKernel.h" #include "src/Core/products/Parallelizer.h" #include "src/Core/ProductEvaluators.h" diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 3c76a58b9..0fde5c71e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -473,48 +473,48 @@ struct random_default_impl }; enum { - floor_log2_terminate, - floor_log2_move_up, - floor_log2_move_down, - floor_log2_bogus + meta_floor_log2_terminate, + meta_floor_log2_move_up, + meta_floor_log2_move_down, + meta_floor_log2_bogus }; -template struct floor_log2_selector +template struct meta_floor_log2_selector { enum { middle = (lower + upper) / 2, - value = (upper <= lower + 1) ? int(floor_log2_terminate) - : (n < (1 << middle)) ? int(floor_log2_move_down) - : (n==0) ? int(floor_log2_bogus) - : int(floor_log2_move_up) + value = (upper <= lower + 1) ? int(meta_floor_log2_terminate) + : (n < (1 << middle)) ? int(meta_floor_log2_move_down) + : (n==0) ? int(meta_floor_log2_bogus) + : int(meta_floor_log2_move_up) }; }; template::value> -struct floor_log2 {}; + int selector = meta_floor_log2_selector::value> +struct meta_floor_log2 {}; template -struct floor_log2 +struct meta_floor_log2 { - enum { value = floor_log2::middle>::value }; + enum { value = meta_floor_log2::middle>::value }; }; template -struct floor_log2 +struct meta_floor_log2 { - enum { value = floor_log2::middle, upper>::value }; + enum { value = meta_floor_log2::middle, upper>::value }; }; template -struct floor_log2 +struct meta_floor_log2 { enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower }; }; template -struct floor_log2 +struct meta_floor_log2 { // no value, error at compile time }; @@ -551,7 +551,7 @@ struct random_default_impl #ifdef EIGEN_MAKING_DOCS return run(Scalar(NumTraits::IsSigned ? -10 : 0), Scalar(10)); #else - enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value, + enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value, scalar_bits = sizeof(Scalar) * CHAR_BIT, shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)), offset = NumTraits::IsSigned ? 
(1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0 diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1b47f1a6d..617439ff6 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -74,45 +74,23 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff } } -/** \brief Computes the blocking parameters for a m x k times k x n matrix product - * - * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. - * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. - * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. - * - * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, - * this function computes the blocking size parameters along the respective dimensions - * for matrix products and related algorithms. The blocking sizes depends on various - * parameters: - * - the L1 and L2 cache sizes, - * - the register level blocking sizes defined by gebp_traits, - * - the number of scalars that fit into a packet (when vectorization is enabled). - * - * \sa setCpuCacheSizes */ +/* Helper for computeProductBlockingSizes. + * + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. The blocking sizes depends on various + * parameters: + * - the L1 and L2 cache sizes, + * - the register level blocking sizes defined by gebp_traits, + * - the number of scalars that fit into a packet (when vectorization is enabled). + * + * \sa setCpuCacheSizes */ template -void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) { typedef gebp_traits Traits; -#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES - if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - EIGEN_UNUSED_VARIABLE(num_threads); - enum { - kr = 8, - mr = Traits::mr, - nr = Traits::nr - }; - k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); - if (k > kr) k -= k % kr; - m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); - if (m > mr) m -= m % mr; - n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); - if (n > nr) n -= n % nr; - return; - } -#endif - // Explanations: // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed @@ -281,6 +259,60 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads } } +inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) +{ +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { + k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + return true; + } +#else + EIGEN_UNUSED_VARIABLE(k) + EIGEN_UNUSED_VARIABLE(m) + EIGEN_UNUSED_VARIABLE(n) + return false; +#endif +} + +/** \brief Computes the blocking parameters for a m x k times k x n matrix product + * + * \param[in,out] k Input: the third dimension of the product. 
Output: the blocking size along the same dimension. + * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. + * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. + * + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. + * + * The blocking size parameters may be evaluated: + * - either by a heuristic based on cache sizes; + * - or using a precomputed lookup table; + * - or using fixed prescribed values (for testing purposes). + * + * \sa setCpuCacheSizes */ + +template +void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + if (!useSpecificBlockingSizes(k, m, n)) { + if (!lookupBlockingSizesFromTable(k, m, n, num_threads)) { + evaluateProductBlockingSizesHeuristic(k, m, n, num_threads); + } + } + + typedef gebp_traits Traits; + enum { + kr = 8, + mr = Traits::mr, + nr = Traits::nr + }; + if (k > kr) k -= k % kr; + if (m > mr) m -= m % mr; + if (n > nr) n -= n % nr; +} + template inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h new file mode 100644 index 000000000..85aeedec8 --- /dev/null +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -0,0 +1,89 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H +#define EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H + +namespace Eigen { + +namespace internal { + +template ::NumSizes != 0 > +struct LookupBlockingSizesFromTableImpl +{ + static bool run(Index&, Index&, Index&, Index) + { + return false; + } +}; + +inline uint8_t floor_log2_helper(uint16_t& x, size_t offset) +{ + uint16_t y = x >> offset; + if (y) { + x = y; + return offset; + } else { + return 0; + } +} + +inline uint8_t floor_log2(uint16_t x) +{ + return floor_log2_helper(x, 8) + + floor_log2_helper(x, 4) + + floor_log2_helper(x, 2) + + floor_log2_helper(x, 1); +} + +inline uint8_t ceil_log2(uint16_t x) +{ + return x > 1 ? 
floor_log2(x - 1) + 1 : 0; +} + +template +struct LookupBlockingSizesFromTableImpl +{ + static bool run(Index& k, Index& m, Index& n, Index) + { + using std::min; + using std::max; + typedef BlockingSizesLookupTable Table; + const uint16_t minsize = Table::BaseSize; + const uint16_t maxsize = minsize << (Table::NumSizes + 1); + const uint16_t k_clamped = max(minsize, min(k, maxsize)); + const uint16_t m_clamped = max(minsize, min(m, maxsize)); + const uint16_t n_clamped = max(minsize, min(n, maxsize)); + const size_t k_index = ceil_log2(k_clamped / minsize); + const size_t m_index = ceil_log2(m_clamped / minsize); + const size_t n_index = ceil_log2(n_clamped / minsize); + const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index); + const uint16_t table_entry = Table::Data()[index]; + k = min(k, 1 << ((table_entry & 0xf00) >> 8)); + m = min(m, 1 << ((table_entry & 0x0f0) >> 4)); + n = min(n, 1 << ((table_entry & 0x00f) >> 0)); + return true; + } +}; + +template +bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads) +{ + return LookupBlockingSizesFromTableImpl::run(k, m, n, num_threads); +} + +} + +} + +#endif // EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index c23892c50..8034f9b5e 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -287,6 +287,14 @@ struct stem_function typedef std::complex::Real> ComplexScalar; typedef ComplexScalar type(ComplexScalar, int); }; + +template +struct BlockingSizesLookupTable +{ + static const size_t NumSizes = 0; +}; + } } // end namespace Eigen From ca5c12587b6e51be7f401c2878800d5d49f615d8 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 18:05:53 -0400 Subject: [PATCH 58/83] Polish lookup tables generation --- bench/analyze-blocking-sizes.cpp | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index c133df599..3d57f4bb3 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -347,13 +347,16 @@ void dump_table_for_subset( abort(); } if (only_cubic_sizes) { - cout << "/* Warning: generated with --only-cubic-sizes ! */" << endl; + cerr << "Can't generate tables with --only-cubic-sizes." 
<< endl; + abort(); } - cout << "struct optimal_block_sizes_table {" << endl; - cout << " static const size_t min_size = " << min_product_size.k << ";" << endl; - cout << " static const size_t max_size = " << max_product_size.k << ";" << endl; - cout << " static const uint16_t* table() {" << endl; - cout << " static const uint16_t data[] = {"; + cout << "struct LookupTable {" << endl; + cout << " static const size_t BaseSize = " << min_product_size.k << ";" << endl; + const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1; + const size_t TableSize = NumSizes * NumSizes * NumSizes; + cout << " static const size_t NumSizes = " << NumSizes << ";" << endl; + cout << " static const uint16_t* Data() {" << endl; + cout << " static const uint16_t data[" << TableSize << "] = {"; while (entry_index < num_entries) { ++entry_index; if (entry_index == num_entries || @@ -371,18 +374,23 @@ void dump_table_for_subset( best_block_size_this_product_size = first_file.entries[e].block_size; } } - if ((i++) % 8) { - cout << ", "; + if ((i++) % NumSizes) { + cout << " "; } else { cout << endl << " "; } cout << "0x" << hex << best_block_size_this_product_size << dec; if (entry_index < num_entries) { + cout << ","; first_entry_index_with_this_product_size = entry_index; product_size = first_file.entries[entry_index].product_size; } } } + if (i != TableSize) { + cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl; + abort(); + } cout << endl << " };" << endl; cout << " return data;" << endl; cout << " }" << endl; @@ -855,6 +863,11 @@ int main(int argc, char* argv[]) input_filenames.emplace_back(argv[i]); } + if (dump_tables && only_cubic_sizes) { + cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl; + show_usage_and_exit(argc, argv, available_actions); + } + if (!action) { show_usage_and_exit(argc, argv, available_actions); } From 1dd3d89818cbdf0843de3c0b499eedc433af46f8 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 18:07:19 -0400 Subject: [PATCH 59/83] Fix a unused-var warning --- Eigen/src/Core/CoreEvaluators.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 85f46cb8d..ce00566a5 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -839,6 +839,7 @@ struct block_evaluator::size) * sizeof(Scalar); + EIGEN_ONLY_USED_FOR_DEBUG(AlignBytes) // FIXME this should be an internal assertion eigen_assert(EIGEN_IMPLIES(evaluator::Flags&AlignedBit, (size_t(block.data()) % AlignBytes) == 0) && "data is not aligned"); } From 3589a9c115a892ea3ca5dac74d71a1526764cb38 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 18:12:18 -0400 Subject: [PATCH 60/83] actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 617439ff6..6aadfea83 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1017,11 +1017,10 @@ void gebp_kernel(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); -#else - const Index actual_panel_rows = (3*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - 
depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ); -#endif for(Index i1=0; i1 Date: Sun, 15 Mar 2015 18:13:12 -0400 Subject: [PATCH 61/83] Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now. --- Eigen/Core | 1 + .../arch/NEON/BlockingSizesLookupTables.h | 110 ++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h diff --git a/Eigen/Core b/Eigen/Core index 138c34916..1a3249604 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -308,6 +308,7 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/PacketMath.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" + #include "src/Core/arch/NEON/BlockingSizesLookupTables.h" #endif #if defined EIGEN_VECTORIZE_CUDA diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h new file mode 100644 index 000000000..c2366a347 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H +#define EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H + +namespace Eigen { +namespace internal { + +/* The following lookup table was generated from measurements on a Nexus 5, + * which has a Qualcomm Krait 400 CPU. This is very representative of current + * 32bit (ARMv7) Android devices. On the other hand, I don't know how + * representative that is outside of these conditions. Accordingly, + * let's only use this lookup table on ARM 32bit on Android for now. + * + * Measurements were single-threaded, with Scalar=float, compiled with + * -mfpu=neon-vfpv4, so the pmadd instruction used was VFMA.F32. + * + * The device was cooled, allowing it to run a the max clock speed throughout. + * This may not be representative of real-world thermal conditions. + * + * The benchmark attempted to flush caches to test cold-cache performance. 
+ */ +#if EIGEN_ARCH_ARM && EIGEN_OS_ANDROID +template<> +struct BlockingSizesLookupTable { + static const size_t BaseSize = 16; + static const size_t NumSizes = 8; + static const uint16_t* Data() { + static const uint16_t data[512] = { + 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447, + 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x457, + 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467, + 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x477, 0x478, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x476, 0x476, 0x476, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x496, + 0x474, 0x475, 0x476, 0x4a6, 0x4a5, 0x4a6, 0x4a5, 0x4a6, + 0x474, 0x475, 0x466, 0x4a6, 0x4a6, 0x4a6, 0x496, 0x4a6, + 0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b, + 0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576, + 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587, + 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x597, 0x596, + 0x564, 0x565, 0x566, 0x5a5, 0x5a6, 0x5a6, 0x596, 0x596, + 0x564, 0x565, 0x566, 0x596, 0x5a6, 0x596, 0x5a6, 0x5a6, + 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b, + 0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b, + 0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667, + 0x674, 0x665, 0x666, 0x677, 0x678, 0x679, 0x67a, 0x675, + 0x684, 0x685, 0x686, 0x687, 0x678, 0x688, 0x687, 0x686, + 0x664, 0x665, 0x666, 0x657, 0x697, 0x696, 0x696, 0x697, + 0x664, 0x655, 0x686, 0x696, 0x685, 0x6a6, 0x686, 0x686, + 0x684, 0x675, 0x686, 0x685, 0x686, 0x696, 0x696, 0x696, + 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x746, + 0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757, + 0x764, 0x755, 0x756, 0x747, 0x768, 0x759, 0x75a, 0x767, + 0x744, 0x765, 0x766, 0x767, 0x768, 0x759, 0x778, 0x777, + 0x744, 0x745, 0x766, 0x767, 0x788, 0x788, 0x786, 0x788, + 0x754, 0x755, 0x766, 0x787, 0x787, 0x796, 0x787, 0x797, + 0x684, 0x695, 0x696, 0x6a5, 0x786, 0x786, 0x795, 0x796, + 0x684, 0x695, 0x686, 0x6a6, 0x786, 0x796, 0x786, 0x796, + 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x847, 0x848, + 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x857, 0x858, + 0x844, 0x865, 0x846, 0x847, 0x868, 0x849, 0x866, 0x867, + 0x844, 0x865, 0x846, 0x847, 0x878, 0x849, 0x877, 0x877, + 0x844, 0x845, 0x846, 0x867, 0x885, 0x887, 0x885, 0x887, + 0x784, 0x785, 0x786, 0x877, 0x885, 0x885, 0x896, 0x896, + 0x684, 0x695, 0x696, 0x885, 0x896, 0x885, 0x895, 0x895, + 0x694, 0x685, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896, + 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x848, 0x848, + 0x944, 0x855, 0x756, 0x947, 0x858, 0x857, 0x858, 0x858, + 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x866, 0x867, + 0x944, 0x775, 0x976, 0x877, 0x877, 0x878, 0x877, 0x877, + 0x784, 0x785, 0x886, 0x887, 0x886, 0x986, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x796, 0x885, 0x897, 0x896, 0x897, + 0x684, 0x695, 0x6a4, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x6a4, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x844, 0x845, 0x846, 0x847, 0x847, 0x847, 0x847, 0x847, + 0x854, 0x855, 0x856, 0x857, 0x857, 0x858, 0x857, 0x857, + 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x867, 0x867, + 0x774, 0x775, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897, + 0x684, 0x6a5, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, + 0x694, 0x6a5, 0x6a5, 0x886, 0xa68, 0x896, 0x896, 0x896, + 0x844, 0x845, 0x846, 0x846, 0x847, 0x945, 0x847, 0x946, + 0x854, 
0xb55, 0x856, 0x857, 0x857, 0x856, 0x857, 0x857, + 0x864, 0x865, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867, + 0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x885, 0x886, 0x886, 0x886, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x786, 0x886, 0x897, 0x897, 0x897, + 0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, + 0x684, 0x6a5, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896 + }; + return data; + } +}; +#endif + +} +} + +#endif // EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H From 151b8b95c642dea388c2f7da475e4db432bc6962 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 15 Mar 2015 19:10:51 -0400 Subject: [PATCH 62/83] Fix bug in case where EIGEN_TEST_SPECIFIC_BLOCKING_SIZE is defined but false --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 6aadfea83..bc24a17ac 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -272,8 +272,8 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) EIGEN_UNUSED_VARIABLE(k) EIGEN_UNUSED_VARIABLE(m) EIGEN_UNUSED_VARIABLE(n) - return false; #endif + return false; } /** \brief Computes the blocking parameters for a m x k times k x n matrix product From e274607d7fcb021d0578e2fc95023eb7b5fab133 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 10:48:27 -0400 Subject: [PATCH 63/83] fix compilation with GCC 4.8 --- Eigen/src/Core/products/LookupBlockingSizesTable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h index 85aeedec8..f955253f8 100644 --- a/Eigen/src/Core/products/LookupBlockingSizesTable.h +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -68,9 +68,9 @@ struct LookupBlockingSizesFromTableImpl const size_t n_index = ceil_log2(n_clamped / minsize); const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index); const uint16_t table_entry = Table::Data()[index]; - k = min(k, 1 << ((table_entry & 0xf00) >> 8)); - m = min(m, 1 << ((table_entry & 0x0f0) >> 4)); - n = min(n, 1 << ((table_entry & 0x00f) >> 0)); + k = min(k, 1 << ((table_entry & 0xf00) >> 8)); + m = min(m, 1 << ((table_entry & 0x0f0) >> 4)); + n = min(n, 1 << ((table_entry & 0x00f) >> 0)); return true; } }; From 35c3a8bb84778a81b2f90fdea10eadeae16863aa Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 11:05:51 -0400 Subject: [PATCH 64/83] Update Nexus 5 lookup table from combining now 2 runs of the benchmark, using the analyze-blocking-sizes partition tool. Gives better worst-case performance. 
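For reading the hex values in these tables: each 16-bit entry packs the base-2 logarithms of the three block sizes in its low nibbles (k, m, n from high to low), which is exactly the encoding consumed by LookupBlockingSizesFromTableImpl::run. A minimal decode sketch (illustrative helper only, not an Eigen function):

    // Illustrative only: unpack one table entry into (k, m, n) block sizes,
    // mirroring the masks and shifts in LookupBlockingSizesFromTableImpl::run.
    inline void decode_blocking_entry(unsigned entry, int& k, int& m, int& n)
    {
      k = 1 << ((entry & 0xf00) >> 8);
      m = 1 << ((entry & 0x0f0) >> 4);
      n = 1 << ((entry & 0x00f) >> 0);
    }
    // e.g. 0x444 -> (16, 16, 16), 0x496 -> (16, 512, 64)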
--- .../arch/NEON/BlockingSizesLookupTables.h | 98 +++++++++---------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h index c2366a347..7905bdf83 100644 --- a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h +++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h @@ -35,69 +35,69 @@ struct BlockingSizesLookupTable { static const uint16_t* Data() { static const uint16_t data[512] = { 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447, - 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x457, + 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456, 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467, - 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x477, 0x478, - 0x474, 0x475, 0x476, 0x477, 0x478, 0x476, 0x476, 0x476, - 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x496, - 0x474, 0x475, 0x476, 0x4a6, 0x4a5, 0x4a6, 0x4a5, 0x4a6, - 0x474, 0x475, 0x466, 0x4a6, 0x4a6, 0x4a6, 0x496, 0x4a6, + 0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x476, 0x478, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x476, 0x476, + 0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x488, + 0x474, 0x475, 0x476, 0x4a6, 0x496, 0x496, 0x495, 0x4a6, + 0x474, 0x475, 0x466, 0x4a6, 0x497, 0x4a5, 0x496, 0x4a5, 0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b, 0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b, 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b, 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576, 0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587, - 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x597, 0x596, - 0x564, 0x565, 0x566, 0x5a5, 0x5a6, 0x5a6, 0x596, 0x596, - 0x564, 0x565, 0x566, 0x596, 0x5a6, 0x596, 0x5a6, 0x5a6, + 0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x596, 0x597, + 0x574, 0x565, 0x566, 0x596, 0x596, 0x5a6, 0x5a6, 0x5a6, + 0x564, 0x565, 0x5a6, 0x596, 0x5a6, 0x5a6, 0x5a6, 0x5a6, 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b, 0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b, 0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667, - 0x674, 0x665, 0x666, 0x677, 0x678, 0x679, 0x67a, 0x675, - 0x684, 0x685, 0x686, 0x687, 0x678, 0x688, 0x687, 0x686, - 0x664, 0x665, 0x666, 0x657, 0x697, 0x696, 0x696, 0x697, - 0x664, 0x655, 0x686, 0x696, 0x685, 0x6a6, 0x686, 0x686, - 0x684, 0x675, 0x686, 0x685, 0x686, 0x696, 0x696, 0x696, - 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x746, + 0x654, 0x665, 0x676, 0x677, 0x678, 0x679, 0x67a, 0x675, + 0x684, 0x675, 0x686, 0x687, 0x688, 0x688, 0x687, 0x686, + 0x664, 0x685, 0x666, 0x677, 0x697, 0x696, 0x697, 0x697, + 0x664, 0x665, 0x696, 0x696, 0x685, 0x6a6, 0x696, 0x696, + 0x664, 0x675, 0x686, 0x696, 0x6a6, 0x696, 0x696, 0x696, + 0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x747, 0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757, - 0x764, 0x755, 0x756, 0x747, 0x768, 0x759, 0x75a, 0x767, - 0x744, 0x765, 0x766, 0x767, 0x768, 0x759, 0x778, 0x777, - 0x744, 0x745, 0x766, 0x767, 0x788, 0x788, 0x786, 0x788, - 0x754, 0x755, 0x766, 0x787, 0x787, 0x796, 0x787, 0x797, - 0x684, 0x695, 0x696, 0x6a5, 0x786, 0x786, 0x795, 0x796, - 0x684, 0x695, 0x686, 0x6a6, 0x786, 0x796, 0x786, 0x796, - 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x847, 0x848, - 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x857, 0x858, - 0x844, 0x865, 0x846, 0x847, 0x868, 0x849, 0x866, 0x867, - 0x844, 0x865, 0x846, 0x847, 0x878, 0x849, 0x877, 0x877, - 0x844, 0x845, 0x846, 0x867, 0x885, 0x887, 0x885, 0x887, - 0x784, 0x785, 0x786, 0x877, 0x885, 0x885, 
0x896, 0x896, - 0x684, 0x695, 0x696, 0x885, 0x896, 0x885, 0x895, 0x895, - 0x694, 0x685, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896, - 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x848, 0x848, - 0x944, 0x855, 0x756, 0x947, 0x858, 0x857, 0x858, 0x858, - 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x866, 0x867, - 0x944, 0x775, 0x976, 0x877, 0x877, 0x878, 0x877, 0x877, - 0x784, 0x785, 0x886, 0x887, 0x886, 0x986, 0x887, 0x887, - 0x784, 0x785, 0x786, 0x796, 0x885, 0x897, 0x896, 0x897, - 0x684, 0x695, 0x6a4, 0x886, 0x886, 0x896, 0x896, 0x896, - 0x6a4, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896, - 0x844, 0x845, 0x846, 0x847, 0x847, 0x847, 0x847, 0x847, - 0x854, 0x855, 0x856, 0x857, 0x857, 0x858, 0x857, 0x857, - 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x867, 0x867, - 0x774, 0x775, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x764, 0x765, 0x756, 0x767, 0x768, 0x759, 0x75a, 0x766, + 0x744, 0x755, 0x766, 0x777, 0x768, 0x759, 0x778, 0x777, + 0x744, 0x745, 0x766, 0x777, 0x788, 0x786, 0x786, 0x788, + 0x754, 0x755, 0x766, 0x787, 0x796, 0x796, 0x787, 0x796, + 0x684, 0x695, 0x696, 0x6a6, 0x795, 0x786, 0x795, 0x796, + 0x684, 0x695, 0x696, 0x795, 0x786, 0x796, 0x795, 0x796, + 0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x848, 0x848, + 0x844, 0x855, 0x846, 0x847, 0x848, 0x849, 0x855, 0x857, + 0x844, 0x845, 0x846, 0x857, 0x848, 0x859, 0x866, 0x865, + 0x844, 0x855, 0x846, 0x847, 0x878, 0x859, 0x877, 0x877, + 0x844, 0x855, 0x846, 0x867, 0x886, 0x887, 0x885, 0x886, + 0x784, 0x785, 0x786, 0x877, 0x897, 0x885, 0x896, 0x896, + 0x684, 0x695, 0x686, 0x886, 0x885, 0x885, 0x886, 0x896, + 0x694, 0x6a5, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896, + 0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x847, 0x848, + 0x954, 0x855, 0x856, 0x947, 0x858, 0x857, 0x858, 0x858, + 0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x867, 0x867, + 0x944, 0x975, 0x976, 0x877, 0x877, 0x877, 0x877, 0x877, + 0x784, 0x785, 0x886, 0x887, 0x886, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x796, 0x887, 0x897, 0x896, 0x896, + 0x684, 0x695, 0x6a6, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x6a4, 0x6a5, 0x696, 0x896, 0x886, 0x896, 0x896, 0x896, + 0xa44, 0xa45, 0xa46, 0xa47, 0x847, 0x848, 0x847, 0x848, + 0xa44, 0xa45, 0x856, 0x857, 0x857, 0x857, 0x857, 0x857, + 0xa44, 0xa65, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867, + 0x774, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, 0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887, 0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897, - 0x684, 0x6a5, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, - 0x694, 0x6a5, 0x6a5, 0x886, 0xa68, 0x896, 0x896, 0x896, - 0x844, 0x845, 0x846, 0x846, 0x847, 0x945, 0x847, 0x946, - 0x854, 0xb55, 0x856, 0x857, 0x857, 0x856, 0x857, 0x857, - 0x864, 0x865, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867, + 0x684, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896, + 0x684, 0x6a5, 0x6a5, 0x886, 0x886, 0x896, 0x896, 0x896, + 0xb44, 0x845, 0x846, 0x847, 0x847, 0x945, 0x846, 0x946, + 0xb54, 0x855, 0x856, 0x857, 0x857, 0x856, 0x857, 0x856, + 0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x866, 0x867, 0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877, - 0x784, 0x885, 0x886, 0x886, 0x886, 0x887, 0x887, 0x887, - 0x784, 0x785, 0x786, 0x786, 0x886, 0x897, 0x897, 0x897, + 0x784, 0x885, 0x886, 0x787, 0x887, 0x887, 0x887, 0x887, + 0x784, 0x785, 0x786, 0x796, 0x886, 0x897, 0x897, 0x897, 0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896, - 0x684, 0x6a5, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896 + 0x684, 0x685, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896 }; return data; } From f218c0181d44d7dd129a77108ad6ec063cfbd6cc Mon 
Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 Mar 2015 13:05:00 -0700 Subject: [PATCH 65/83] Fixes the Lvalue computation by actually setting the LvalueBit properly when instantiating tensors of const T. Added a test to check the fix. --- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 8 ++-- unsupported/test/cxx11_tensor_ref.cpp | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index a844a4d68..66ddfd554 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -54,7 +54,7 @@ struct traits > static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, - Flags = compute_tensor_flags::ret | LvalueBit, + Flags = compute_tensor_flags::ret | (is_const::value ? 0 : LvalueBit), }; }; @@ -69,7 +69,7 @@ struct traits > static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, - Flags = compute_tensor_flags::ret | LvalueBit, + Flags = compute_tensor_flags::ret | (is_const::value ? 0: LvalueBit), }; }; @@ -86,7 +86,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = Options_, - Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), }; }; @@ -102,7 +102,7 @@ struct traits > static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, - Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + Flags = (BaseTraits::Flags & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), }; }; diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index aa369f278..c7b5ecddb 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -196,6 +196,45 @@ static void test_coeff_ref() } +static void test_nested_ops_with_ref() +{ + Tensor t(2, 3, 5, 7); + t.setRandom(); + TensorMap > m(t.data(), 2, 3, 5, 7); + array, 4> paddings; + paddings[0] = make_pair(0, 0); + paddings[1] = make_pair(2, 1); + paddings[2] = make_pair(3, 4); + paddings[3] = make_pair(0, 0); + Eigen::DSizes shuffle_dims{0, 1, 2, 3}; + TensorRef > ref(m.pad(paddings)); + array, 4> trivial; + trivial[0] = make_pair(0, 0); + trivial[1] = make_pair(0, 0); + trivial[2] = make_pair(0, 0); + trivial[3] = make_pair(0, 0); + Tensor padded = ref.shuffle(shuffle_dims).pad(trivial); + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(padded(i,j,k,l), t(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); + } + } + } + } + } +} + + void test_cxx11_tensor_ref() { CALL_SUBTEST(test_simple_lvalue_ref()); @@ -205,4 +244,5 @@ void test_cxx11_tensor_ref() CALL_SUBTEST(test_ref_of_ref()); CALL_SUBTEST(test_ref_in_expr()); CALL_SUBTEST(test_coeff_ref()); + CALL_SUBTEST(test_nested_ops_with_ref()); } From eb6929cb191c1b91dda784697faedcfdb245345a Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 16:15:47 -0400 Subject: [PATCH 66/83] fix bug in maxsize calculation, which 
would cause products of size > 2048 to address the lookup table out of bounds --- Eigen/src/Core/products/LookupBlockingSizesTable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h index f955253f8..3c8aba6f8 100644 --- a/Eigen/src/Core/products/LookupBlockingSizesTable.h +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -59,7 +59,7 @@ struct LookupBlockingSizesFromTableImpl using std::max; typedef BlockingSizesLookupTable Table; const uint16_t minsize = Table::BaseSize; - const uint16_t maxsize = minsize << (Table::NumSizes + 1); + const uint16_t maxsize = minsize << (Table::NumSizes - 1); const uint16_t k_clamped = max(minsize, min(k, maxsize)); const uint16_t m_clamped = max(minsize, min(m, maxsize)); const uint16_t n_clamped = max(minsize, min(n, maxsize)); From 0fd6d52724555d70f663c7ec56db6419e95db6cc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 Mar 2015 13:16:12 -0700 Subject: [PATCH 67/83] Fixed compilation error with clang --- unsupported/test/cxx11_tensor_ref.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index c7b5ecddb..59530fe43 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -201,18 +201,18 @@ static void test_nested_ops_with_ref() Tensor t(2, 3, 5, 7); t.setRandom(); TensorMap > m(t.data(), 2, 3, 5, 7); - array, 4> paddings; - paddings[0] = make_pair(0, 0); - paddings[1] = make_pair(2, 1); - paddings[2] = make_pair(3, 4); - paddings[3] = make_pair(0, 0); - Eigen::DSizes shuffle_dims{0, 1, 2, 3}; + array, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + DSizes shuffle_dims{0, 1, 2, 3}; TensorRef > ref(m.pad(paddings)); - array, 4> trivial; - trivial[0] = make_pair(0, 0); - trivial[1] = make_pair(0, 0); - trivial[2] = make_pair(0, 0); - trivial[3] = make_pair(0, 0); + array, 4> trivial; + trivial[0] = std::make_pair(0, 0); + trivial[1] = std::make_pair(0, 0); + trivial[2] = std::make_pair(0, 0); + trivial[3] = std::make_pair(0, 0); Tensor padded = ref.shuffle(shuffle_dims).pad(trivial); VERIFY_IS_EQUAL(padded.dimension(0), 2+0); VERIFY_IS_EQUAL(padded.dimension(1), 3+3); From 5144f66728fb156402799682cfd926faf0e98fb4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 Mar 2015 13:17:52 -0700 Subject: [PATCH 68/83] Fixed compilation warning --- unsupported/test/cxx11_tensor_ref.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index 59530fe43..c8f105e3d 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -206,7 +206,7 @@ static void test_nested_ops_with_ref() paddings[1] = std::make_pair(2, 1); paddings[2] = std::make_pair(3, 4); paddings[3] = std::make_pair(0, 0); - DSizes shuffle_dims{0, 1, 2, 3}; + DSizes shuffle_dims(0, 1, 2, 3); TensorRef > ref(m.pad(paddings)); array, 4> trivial; trivial[0] = std::make_pair(0, 0); From 577056aa9400459f1a1bad9e423857bc1763f18f Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 16:21:50 -0400 Subject: [PATCH 69/83] Include stdint.h. Not going for cstdint because it is a C++11 addition. Needed for uint16_t at least, in lookup-table code. 
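For reference, a sketch of how these uint16_t sizes are turned into a table index (assuming the NEON table's BaseSize = 16 and NumSizes = 8):

    // Per-dimension index, mirroring the clamp + ceil_log2 logic in
    // LookupBlockingSizesFromTableImpl::run:
    //   index_1d(size) = ceil_log2(clamp(size, 16, 2048) / 16)   // 0 .. 7
    //   flat index     = n_idx + 8 * (m_idx + 8 * k_idx)         // 0 .. 511
    // With the corrected maxsize = 16 << (NumSizes - 1) = 2048 the flat index
    // stays within data[512]; the previous 16 << (NumSizes + 1) = 8192 allowed
    // per-dimension indices up to 9, hence the out-of-bounds reads fixed above.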
--- Eigen/Core | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/Core b/Eigen/Core index 1a3249604..80842a1de 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -61,6 +61,7 @@ #pragma GCC optimize ("-fno-ipa-cp-clone") #endif +#include #include // this include file manages BLAS and MKL related macros From 364cfd529d853545e6b0a7404fe303cda2de2366 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 16 Mar 2015 16:28:44 -0400 Subject: [PATCH 70/83] Similar to cset 3589a9c115a892ea3ca5dac74d71a1526764cb38 , also in 2px4 kernel: actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bc24a17ac..d32377a00 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1253,11 +1253,11 @@ void gebp_kernel=2*Traits::LhsProgress) { const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. -#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size + // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess), + // or because we are testing specific blocking sizes. Index actual_panel_rows = (2*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); -#else - Index actual_panel_rows = (2*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ); -#endif + for(Index i1=peeled_mc3; i1 Date: Tue, 17 Mar 2015 10:31:45 -0400 Subject: [PATCH 71/83] use unsigned short instead of uint16_t which doesn't exist in c++98 --- Eigen/Core | 1 - .../arch/NEON/BlockingSizesLookupTables.h | 4 ++-- .../Core/products/LookupBlockingSizesTable.h | 20 +++++++++---------- bench/analyze-blocking-sizes.cpp | 4 ++-- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 80842a1de..1a3249604 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -61,7 +61,6 @@ #pragma GCC optimize ("-fno-ipa-cp-clone") #endif -#include #include // this include file manages BLAS and MKL related macros diff --git a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h index 7905bdf83..5007c155d 100644 --- a/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h +++ b/Eigen/src/Core/arch/NEON/BlockingSizesLookupTables.h @@ -32,8 +32,8 @@ template<> struct BlockingSizesLookupTable { static const size_t BaseSize = 16; static const size_t NumSizes = 8; - static const uint16_t* Data() { - static const uint16_t data[512] = { + static const unsigned short* Data() { + static const unsigned short data[512] = { 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447, 0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456, 0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467, diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h index 3c8aba6f8..5ab4525df 100644 --- a/Eigen/src/Core/products/LookupBlockingSizesTable.h +++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h @@ -25,9 +25,9 @@ struct 
LookupBlockingSizesFromTableImpl } }; -inline uint8_t floor_log2_helper(uint16_t& x, size_t offset) +inline size_t floor_log2_helper(unsigned short& x, size_t offset) { - uint16_t y = x >> offset; + unsigned short y = x >> offset; if (y) { x = y; return offset; @@ -36,7 +36,7 @@ inline uint8_t floor_log2_helper(uint16_t& x, size_t offset) } } -inline uint8_t floor_log2(uint16_t x) +inline size_t floor_log2(unsigned short x) { return floor_log2_helper(x, 8) + floor_log2_helper(x, 4) @@ -44,7 +44,7 @@ inline uint8_t floor_log2(uint16_t x) + floor_log2_helper(x, 1); } -inline uint8_t ceil_log2(uint16_t x) +inline size_t ceil_log2(unsigned short x) { return x > 1 ? floor_log2(x - 1) + 1 : 0; } @@ -58,16 +58,16 @@ struct LookupBlockingSizesFromTableImpl using std::min; using std::max; typedef BlockingSizesLookupTable Table; - const uint16_t minsize = Table::BaseSize; - const uint16_t maxsize = minsize << (Table::NumSizes - 1); - const uint16_t k_clamped = max(minsize, min(k, maxsize)); - const uint16_t m_clamped = max(minsize, min(m, maxsize)); - const uint16_t n_clamped = max(minsize, min(n, maxsize)); + const unsigned short minsize = Table::BaseSize; + const unsigned short maxsize = minsize << (Table::NumSizes - 1); + const unsigned short k_clamped = max(minsize, min(k, maxsize)); + const unsigned short m_clamped = max(minsize, min(m, maxsize)); + const unsigned short n_clamped = max(minsize, min(n, maxsize)); const size_t k_index = ceil_log2(k_clamped / minsize); const size_t m_index = ceil_log2(m_clamped / minsize); const size_t n_index = ceil_log2(n_clamped / minsize); const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index); - const uint16_t table_entry = Table::Data()[index]; + const unsigned short table_entry = Table::Data()[index]; k = min(k, 1 << ((table_entry & 0xf00) >> 8)); m = min(m, 1 << ((table_entry & 0x0f0) >> 4)); n = min(n, 1 << ((table_entry & 0x00f) >> 0)); diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index 3d57f4bb3..d563a1d2d 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -355,8 +355,8 @@ void dump_table_for_subset( const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1; const size_t TableSize = NumSizes * NumSizes * NumSizes; cout << " static const size_t NumSizes = " << NumSizes << ";" << endl; - cout << " static const uint16_t* Data() {" << endl; - cout << " static const uint16_t data[" << TableSize << "] = {"; + cout << " static const unsigned short* Data() {" << endl; + cout << " static const unsigned short data[" << TableSize << "] = {"; while (entry_index < num_entries) { ++entry_index; if (entry_index == num_entries || From cc0f89eb3b07c65efb2b73890e4e7ac83525700a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 17 Mar 2015 09:57:20 -0700 Subject: [PATCH 72/83] Changed the way lvalue operations are declared in TensorBase: this fixes constness isses that prevented some expressions mixing lvalues and rvalues from compiling. 
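A minimal usage sketch of the intended effect (float tensors as in the updated cxx11_tensor_const test; assumes the CXX11 Tensor module header is on the include path):

    #include <unsupported/Eigen/CXX11/Tensor>  // assumption: header location

    void chip_both_ways()
    {
      Eigen::Tensor<float, 3> t(2, 3, 7);
      t.setRandom();
      const Eigen::Tensor<float, 3>& ct = t;
      Eigen::Tensor<float, 2> slice = ct.chip(0, 2); // const overload: rvalue read
      t.chip(0, 2) = slice;                          // non-const overload: lvalue write
    }

The paired declarations give the const use a const return type while keeping the non-const use writable.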
--- .../Eigen/CXX11/src/Tensor/TensorBase.h | 73 ++++++++++++++++--- unsupported/test/cxx11_tensor_const.cpp | 27 ++++++- 2 files changed, 88 insertions(+), 12 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 201b0fc9e..86e72c3a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -520,48 +520,101 @@ class TensorBase : public TensorBase + const TensorLayoutSwapOp swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() { return TensorLayoutSwapOp(derived()); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp(derived(), other, axis); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp - concatenate(const OtherDerived& other, const Axis& axis) const { - return TensorConcatenationOp(derived(), other.derived(), axis); + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp(derived(), other, axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp - reshape(const NewDimensions& newDimensions) const { + reshape(const NewDimensions& newDimensions) { return TensorReshapingOp(derived(), newDimensions); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp - slice(const StartIndices& startIndices, const Sizes& sizes) const { + slice(const StartIndices& startIndices, const Sizes& sizes) { return TensorSlicingOp(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp + const TensorChippingOp chip(const Index offset) const { + return TensorChippingOp(derived(), offset, DimId); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) { return TensorChippingOp(derived(), offset, DimId); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp - chip(const Index offset, const Index dim) const { + chip(const Index offset, const Index dim) { return TensorChippingOp(derived(), offset, dim); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp - reverse(const ReverseDimensions& rev) const { + reverse(const ReverseDimensions& rev) { return TensorReverseOp(derived(), rev); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp - shuffle(const Shuffle& shuffle) const { + shuffle(const Shuffle& shuffle) { return TensorShufflingOp(derived(), shuffle); } + + template 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp - stride(const Strides& strides) const { + stride(const Strides& strides) { return TensorStridingOp(derived(), strides); } diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp index 0ffb02afd..ad9c9da39 100644 --- a/unsupported/test/cxx11_tensor_const.cpp +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -13,8 +13,6 @@ using Eigen::Tensor; - - static void test_simple_assign() { Tensor random(2,3,7); @@ -33,7 +31,32 @@ static void test_simple_assign() } } + +static void test_assign_of_const_tensor() +{ + Tensor random(2,3,7); + random.setRandom(); + + TensorMap > constant1(random.data(), 2, 3, 7); + TensorMap > constant2(random.data(), 2, 3, 7); + const TensorMap > constant3(random.data(), 2, 3, 7); + + Tensor result1 = constant1.chip(0, 2); + Tensor result2 = constant2.chip(0, 2); + Tensor result3 = constant3.chip(0, 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL((result1(i,j)), random(i,j,0)); + VERIFY_IS_EQUAL((result2(i,j)), random(i,j,0)); + VERIFY_IS_EQUAL((result3(i,j)), random(i,j,0)); + } + } +} + + void test_cxx11_tensor_const() { CALL_SUBTEST(test_simple_assign()); + CALL_SUBTEST(test_assign_of_const_tensor()); } From f329d0908af35fd17bdc4dfeb87046dcaa6e6937 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Mar 2015 15:10:36 +0100 Subject: [PATCH 73/83] Improve random number generation for integer and add unit test --- Eigen/src/Core/MathFunctions.h | 32 ++++++------- test/CMakeLists.txt | 1 + test/rand.cpp | 88 ++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 18 deletions(-) create mode 100644 test/rand.cpp diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 0fde5c71e..e1b233d82 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -522,28 +522,24 @@ struct meta_floor_log2 template struct random_default_impl { - typedef typename NumTraits::NonInteger NonInteger; - static inline Scalar run(const Scalar& x, const Scalar& y) { using std::max; - Scalar range = (max)(Scalar(0),Scalar(y-x)); - Scalar offset = 0; - if(range<=RAND_MAX) - { - // rejection sampling - int divisor = RAND_MAX/(range+1); + using std::min; + typedef typename conditional::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; + if(y range); - } - else - { - offset = std::rand() * range; - } - - return x + offset; + do { + offset = ( (std::size_t(std::rand()) * multiplier) / divisor ); + } while (offset > range); + + return Scalar(ScalarX(x) + offset); } static inline Scalar run() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1712b8718..734a0eb9b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -139,6 +139,7 @@ endif(TEST_LIB) set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official") add_custom_target(BuildOfficial) +ei_add_test(rand) ei_add_test(meta) ei_add_test(sizeof) ei_add_test(dynalloc) diff --git a/test/rand.cpp b/test/rand.cpp new file mode 100644 index 000000000..4e090cbad --- /dev/null +++ b/test/rand.cpp @@ -0,0 +1,88 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +template Scalar check_in_range(Scalar x, Scalar y) +{ + Scalar r = internal::random(x,y); + VERIFY(r>=x); + if(y>=x) + { + VERIFY(r<=y); + } + return r; +} + +template void check_all_in_range(Scalar x, Scalar y) +{ + Array mask(y-x+1); + mask.fill(0); + long n = (y-x+1)*32; + for(long k=0; k0).all() ); +} + +void test_rand() +{ + for(int i = 0; i < g_repeat*10; i++) { + CALL_SUBTEST(check_in_range(10,11)); + CALL_SUBTEST(check_in_range(1.24234523,1.24234523)); + CALL_SUBTEST(check_in_range(-1,1)); + CALL_SUBTEST(check_in_range(-1432.2352,-1432.2352)); + + CALL_SUBTEST(check_in_range(10,11)); + CALL_SUBTEST(check_in_range(1.24234523,1.24234523)); + CALL_SUBTEST(check_in_range(-1,1)); + CALL_SUBTEST(check_in_range(-1432.2352,-1432.2352)); + + + CALL_SUBTEST(check_in_range(0,-1)); + CALL_SUBTEST(check_in_range(0,-1)); + CALL_SUBTEST(check_in_range(0,-1)); + CALL_SUBTEST(check_in_range(-673456,673456)); + CALL_SUBTEST(check_in_range(-24345,24345)); + CALL_SUBTEST(check_in_range(-6734565664234,6734565664234)); + } + + char char_offset = (std::min)(g_repeat,64); + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+char_offset)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-char_offset,-11)); + CALL_SUBTEST(check_all_in_range(-126,-126+char_offset)); + CALL_SUBTEST(check_all_in_range(126-char_offset,126)); + CALL_SUBTEST(check_all_in_range(-126,126)); + + char short_offset = (std::min)(g_repeat,16000); + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+short_offset)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-short_offset,-11)); + CALL_SUBTEST(check_all_in_range(-24345,-24345+short_offset)); + CALL_SUBTEST(check_all_in_range(24345,24345+short_offset)); + + + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+g_repeat)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-g_repeat,-11)); + CALL_SUBTEST(check_all_in_range(-673456,-673456+g_repeat)); + CALL_SUBTEST(check_all_in_range(673456,673456+g_repeat)); + + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+g_repeat)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-g_repeat,-11)); + CALL_SUBTEST(check_all_in_range(-6734565664234,-6734565664234+g_repeat)); + CALL_SUBTEST(check_all_in_range(6734565664234,6734565664234+g_repeat)); +} From d7698c18b7801f041c36adffcdfaefc99140887f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Mar 2015 15:11:05 +0100 Subject: [PATCH 74/83] Split sparse_basic unit test --- test/CMakeLists.txt | 1 + test/sparse_basic.cpp | 204 +-------------------------------- test/sparse_block.cpp | 254 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+), 200 deletions(-) create mode 100644 test/sparse_block.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 734a0eb9b..393c35b57 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -227,6 +227,7 @@ ei_add_test(stdvector_overload) ei_add_test(stdlist) ei_add_test(stddeque) ei_add_test(sparse_basic) +ei_add_test(sparse_block) ei_add_test(sparse_vector) ei_add_test(sparse_product) ei_add_test(sparse_ref) diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index d929e1463..75f29a2b4 100644 --- a/test/sparse_basic.cpp +++ 
b/test/sparse_basic.cpp @@ -30,7 +30,6 @@ template void sparse_basic(const SparseMatrixType& re double density = (std::max)(8./(rows*cols), 0.01); typedef Matrix DenseMatrix; typedef Matrix DenseVector; - typedef Matrix RowDenseVector; Scalar eps = 1e-6; Scalar s1 = internal::random(); @@ -59,77 +58,6 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m, refMat); - // test InnerIterators and Block expressions - for (int t=0; t<10; ++t) - { - Index j = internal::random(0,cols-2); - Index i = internal::random(0,rows-2); - Index w = internal::random(1,cols-j); - Index h = internal::random(1,rows-i); - - VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w)); - for(Index c=0; c void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m2,m1); } - // test innerVector() - { - DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); - SparseMatrixType m2(rows, cols); - initSparse(density, refMat2, m2); - Index j0 = internal::random(0,outer-1); - Index j1 = internal::random(0,outer-1); - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0)); - else - VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0)); - - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1)); - else - VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1)); - - SparseMatrixType m3(rows,cols); - m3.reserve(VectorXi::Constant(outer,int(inner/2))); - for(Index j=0; j0) - VERIFY(j==numext::real(m3.innerVector(j).lastCoeff())); - } - m3.makeCompressed(); - for(Index j=0; j<(std::min)(outer, inner); ++j) - { - VERIFY(j==numext::real(m3.innerVector(j).nonZeros())); - if(j>0) - VERIFY(j==numext::real(m3.innerVector(j).lastCoeff())); - } - - VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros()); - -// m2.innerVector(j0) = 2*m2.innerVector(j1); -// refMat2.col(j0) = 2*refMat2.col(j1); -// VERIFY_IS_APPROX(m2, refMat2); - } - - // test innerVectors() - { - DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); - SparseMatrixType m2(rows, cols); - initSparse(density, refMat2, m2); - if(internal::random(0,1)>0.5) m2.makeCompressed(); - Index j0 = internal::random(0,outer-2); - Index j1 = internal::random(0,outer-2); - Index n0 = internal::random(1,outer-(std::max)(j0,j1)); - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols)); - else - VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0)); - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0), - refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0)); - else - VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0), - refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0)); - - VERIFY_IS_APPROX(m2, refMat2); - - VERIFY(m2.innerVectors(j0,n0).nonZeros() == m2.transpose().innerVectors(j0,n0).nonZeros()); - - m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0); - if(SparseMatrixType::IsRowMajor) - refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval(); - else - refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval(); - - VERIFY_IS_APPROX(m2, refMat2); - } - // test basic computations { DenseMatrix refM1 = DenseMatrix::Zero(rows, cols); @@ -360,54 +212,6 @@ template void sparse_basic(const SparseMatrixType& re VERIFY(m2.isApprox(m3)); } - - - // test generic blocks - { - DenseMatrix refMat2 = 
DenseMatrix::Zero(rows, cols); - SparseMatrixType m2(rows, cols); - initSparse(density, refMat2, m2); - Index j0 = internal::random(0,outer-2); - Index j1 = internal::random(0,outer-2); - Index n0 = internal::random(1,outer-(std::max)(j0,j1)); - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols)); - else - VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0)); - - if(SparseMatrixType::IsRowMajor) - VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols), - refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols)); - else - VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0), - refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0)); - - Index i = internal::random(0,m2.outerSize()-1); - if(SparseMatrixType::IsRowMajor) { - m2.innerVector(i) = m2.innerVector(i) * s1; - refMat2.row(i) = refMat2.row(i) * s1; - VERIFY_IS_APPROX(m2,refMat2); - } else { - m2.innerVector(i) = m2.innerVector(i) * s1; - refMat2.col(i) = refMat2.col(i) * s1; - VERIFY_IS_APPROX(m2,refMat2); - } - - Index r0 = internal::random(0,rows-2); - Index c0 = internal::random(0,cols-2); - Index r1 = internal::random(1,rows-r0); - Index c1 = internal::random(1,cols-c0); - - VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0)); - VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0)); - - VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0)); - VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0)); - - VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1)); - VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1)); - } - // test prune { SparseMatrixType m2(rows, cols); @@ -646,8 +450,8 @@ void test_sparse_basic() CALL_SUBTEST_2(( sparse_basic(SparseMatrix, ColMajor>(r, c)) )); CALL_SUBTEST_2(( sparse_basic(SparseMatrix, RowMajor>(r, c)) )); CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); - CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); - CALL_SUBTEST_1(( sparse_basic(SparseMatrix(r, c)) )); + CALL_SUBTEST_5(( sparse_basic(SparseMatrix(r, c)) )); + CALL_SUBTEST_5(( sparse_basic(SparseMatrix(r, c)) )); r = Eigen::internal::random(1,100); c = Eigen::internal::random(1,100); @@ -655,8 +459,8 @@ void test_sparse_basic() r = c; // check square matrices in 25% of tries } - CALL_SUBTEST_1(( sparse_basic(SparseMatrix(short(r), short(c))) )); - CALL_SUBTEST_1(( sparse_basic(SparseMatrix(short(r), short(c))) )); + CALL_SUBTEST_6(( sparse_basic(SparseMatrix(short(r), short(c))) )); + CALL_SUBTEST_6(( sparse_basic(SparseMatrix(short(r), short(c))) )); } // Regression test for bug 900: (manually insert higher values here, if you have enough RAM): diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp new file mode 100644 index 000000000..8a6e0687c --- /dev/null +++ b/test/sparse_block.cpp @@ -0,0 +1,254 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "sparse.h" + +template void sparse_block(const SparseMatrixType& ref) +{ + const Index rows = ref.rows(); + const Index cols = ref.cols(); + const Index inner = ref.innerSize(); + const Index outer = ref.outerSize(); + + typedef typename SparseMatrixType::Scalar Scalar; + + double density = (std::max)(8./(rows*cols), 0.01); + typedef Matrix DenseMatrix; + typedef Matrix DenseVector; + typedef Matrix RowDenseVector; + + Scalar s1 = internal::random(); + { + SparseMatrixType m(rows, cols); + DenseMatrix refMat = DenseMatrix::Zero(rows, cols); + initSparse(density, refMat, m); + + VERIFY_IS_APPROX(m, refMat); + + // test InnerIterators and Block expressions + for (int t=0; t<10; ++t) + { + Index j = internal::random(0,cols-2); + Index i = internal::random(0,rows-2); + Index w = internal::random(1,cols-j); + Index h = internal::random(1,rows-i); + + VERIFY_IS_APPROX(m.block(i,j,h,w), refMat.block(i,j,h,w)); + for(Index c=0; c(density, refMat2, m2); + Index j0 = internal::random(0,outer-1); + Index j1 = internal::random(0,outer-1); + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.row(j0)); + else + VERIFY_IS_APPROX(m2.innerVector(j0), refMat2.col(j0)); + + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.row(j0)+refMat2.row(j1)); + else + VERIFY_IS_APPROX(m2.innerVector(j0)+m2.innerVector(j1), refMat2.col(j0)+refMat2.col(j1)); + + SparseMatrixType m3(rows,cols); + m3.reserve(VectorXi::Constant(outer,int(inner/2))); + for(Index j=0; j0) + VERIFY(j==numext::real(m3.innerVector(j).lastCoeff())); + } + m3.makeCompressed(); + for(Index j=0; j<(std::min)(outer, inner); ++j) + { + VERIFY(j==numext::real(m3.innerVector(j).nonZeros())); + if(j>0) + VERIFY(j==numext::real(m3.innerVector(j).lastCoeff())); + } + + VERIFY(m3.innerVector(j0).nonZeros() == m3.transpose().innerVector(j0).nonZeros()); + +// m2.innerVector(j0) = 2*m2.innerVector(j1); +// refMat2.col(j0) = 2*refMat2.col(j1); +// VERIFY_IS_APPROX(m2, refMat2); + } + + // test innerVectors() + { + DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); + SparseMatrixType m2(rows, cols); + initSparse(density, refMat2, m2); + if(internal::random(0,1)>0.5) m2.makeCompressed(); + Index j0 = internal::random(0,outer-2); + Index j1 = internal::random(0,outer-2); + Index n0 = internal::random(1,outer-(std::max)(j0,j1)); + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(j0,0,n0,cols)); + else + VERIFY_IS_APPROX(m2.innerVectors(j0,n0), refMat2.block(0,j0,rows,n0)); + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0), + refMat2.middleRows(j0,n0)+refMat2.middleRows(j1,n0)); + else + VERIFY_IS_APPROX(m2.innerVectors(j0,n0)+m2.innerVectors(j1,n0), + refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0)); + + VERIFY_IS_APPROX(m2, refMat2); + + VERIFY(m2.innerVectors(j0,n0).nonZeros() == m2.transpose().innerVectors(j0,n0).nonZeros()); + + m2.innerVectors(j0,n0) = m2.innerVectors(j0,n0) + m2.innerVectors(j1,n0); + if(SparseMatrixType::IsRowMajor) + refMat2.middleRows(j0,n0) = (refMat2.middleRows(j0,n0) + refMat2.middleRows(j1,n0)).eval(); + else + refMat2.middleCols(j0,n0) = (refMat2.middleCols(j0,n0) + refMat2.middleCols(j1,n0)).eval(); + + VERIFY_IS_APPROX(m2, refMat2); + } + + // test generic blocks + { + DenseMatrix refMat2 = DenseMatrix::Zero(rows, cols); + SparseMatrixType m2(rows, cols); + initSparse(density, refMat2, m2); + Index j0 = internal::random(0,outer-2); + Index j1 = 
internal::random(0,outer-2); + Index n0 = internal::random(1,outer-(std::max)(j0,j1)); + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.block(j0,0,n0,cols), refMat2.block(j0,0,n0,cols)); + else + VERIFY_IS_APPROX(m2.block(0,j0,rows,n0), refMat2.block(0,j0,rows,n0)); + + if(SparseMatrixType::IsRowMajor) + VERIFY_IS_APPROX(m2.block(j0,0,n0,cols)+m2.block(j1,0,n0,cols), + refMat2.block(j0,0,n0,cols)+refMat2.block(j1,0,n0,cols)); + else + VERIFY_IS_APPROX(m2.block(0,j0,rows,n0)+m2.block(0,j1,rows,n0), + refMat2.block(0,j0,rows,n0)+refMat2.block(0,j1,rows,n0)); + + Index i = internal::random(0,m2.outerSize()-1); + if(SparseMatrixType::IsRowMajor) { + m2.innerVector(i) = m2.innerVector(i) * s1; + refMat2.row(i) = refMat2.row(i) * s1; + VERIFY_IS_APPROX(m2,refMat2); + } else { + m2.innerVector(i) = m2.innerVector(i) * s1; + refMat2.col(i) = refMat2.col(i) * s1; + VERIFY_IS_APPROX(m2,refMat2); + } + + Index r0 = internal::random(0,rows-2); + Index c0 = internal::random(0,cols-2); + Index r1 = internal::random(1,rows-r0); + Index c1 = internal::random(1,cols-c0); + + VERIFY_IS_APPROX(DenseVector(m2.col(c0)), refMat2.col(c0)); + VERIFY_IS_APPROX(m2.col(c0), refMat2.col(c0)); + + VERIFY_IS_APPROX(RowDenseVector(m2.row(r0)), refMat2.row(r0)); + VERIFY_IS_APPROX(m2.row(r0), refMat2.row(r0)); + + VERIFY_IS_APPROX(m2.block(r0,c0,r1,c1), refMat2.block(r0,c0,r1,c1)); + VERIFY_IS_APPROX((2*m2).block(r0,c0,r1,c1), (2*refMat2).block(r0,c0,r1,c1)); + } +} + +void test_sparse_block() +{ + for(int i = 0; i < g_repeat; i++) { + int r = Eigen::internal::random(1,200), c = Eigen::internal::random(1,200); + if(Eigen::internal::random(0,4) == 0) { + r = c; // check square matrices in 25% of tries + } + EIGEN_UNUSED_VARIABLE(r+c); + CALL_SUBTEST_1(( sparse_block(SparseMatrix(1, 1)) )); + CALL_SUBTEST_1(( sparse_block(SparseMatrix(8, 8)) )); + CALL_SUBTEST_1(( sparse_block(SparseMatrix(r, c)) )); + CALL_SUBTEST_2(( sparse_block(SparseMatrix, ColMajor>(r, c)) )); + CALL_SUBTEST_2(( sparse_block(SparseMatrix, RowMajor>(r, c)) )); + + CALL_SUBTEST_3(( sparse_block(SparseMatrix(r, c)) )); + CALL_SUBTEST_3(( sparse_block(SparseMatrix(r, c)) )); + + r = Eigen::internal::random(1,100); + c = Eigen::internal::random(1,100); + if(Eigen::internal::random(0,4) == 0) { + r = c; // check square matrices in 25% of tries + } + + CALL_SUBTEST_4(( sparse_block(SparseMatrix(short(r), short(c))) )); + CALL_SUBTEST_4(( sparse_block(SparseMatrix(short(r), short(c))) )); + } +} From 61c45d7cfd29a161abefd095107b72c75db76ffc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Mar 2015 17:13:22 +0100 Subject: [PATCH 75/83] Fix comparison warning --- Eigen/src/Core/DenseStorage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 522aaa299..ab41641f4 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -36,8 +36,8 @@ void check_static_allocation_size() template::type, bool Match = bool((Size%unpacket_traits::size)==0), - bool TryHalf = bool(unpacket_traits::size > Size) - && bool(unpacket_traits::size > unpacket_traits::half>::size) > + bool TryHalf = bool(int(unpacket_traits::size) > Size) + && bool(int(unpacket_traits::size) > int(unpacket_traits::half>::size)) > struct compute_default_alignment { enum { value = 0 }; From d6b2f300dbab3f11f6f8bd28558d2b91758ad514 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 19 Mar 2015 17:28:32 +0100 Subject: [PATCH 76/83] Fix MSVC compilation: aligned type must be 
From d6b2f300dbab3f11f6f8bd28558d2b91758ad514 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 19 Mar 2015 17:28:32 +0100
Subject: [PATCH 76/83] Fix MSVC compilation: aligned type must be passed by
 reference

---
 Eigen/src/Core/util/BlasUtil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 9bfa45106..ffeb5ac5f 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -214,7 +214,7 @@ class blas_data_mapper {
   }
 
   template
-  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
+  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
     pscatter(&operator()(i, j), p, m_stride);
   }
 

From 9ee62fdcd51e78a7e5f53c5868680fe0b6b261e4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 19 Mar 2015 21:39:37 +0100
Subject: [PATCH 77/83] Fix random unit test for 32bits systems.

---
 test/rand.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/rand.cpp b/test/rand.cpp
index 4e090cbad..7c8068a3b 100644
--- a/test/rand.cpp
+++ b/test/rand.cpp
@@ -34,6 +34,10 @@ template void check_all_in_range(Scalar x, Scalar y)
 
 void test_rand()
 {
+  long long_ref = NumTraits::highest()/10;
+  char char_offset = (std::min)(g_repeat,64);
+  char short_offset = (std::min)(g_repeat,16000);
+
   for(int i = 0; i < g_repeat*10; i++) {
     CALL_SUBTEST(check_in_range(10,11));
     CALL_SUBTEST(check_in_range(1.24234523,1.24234523));
@@ -45,16 +49,14 @@ void test_rand()
     CALL_SUBTEST(check_in_range(-1,1));
     CALL_SUBTEST(check_in_range(-1432.2352,-1432.2352));
 
-    CALL_SUBTEST(check_in_range(0,-1));
     CALL_SUBTEST(check_in_range(0,-1));
     CALL_SUBTEST(check_in_range(0,-1));
     CALL_SUBTEST(check_in_range(-673456,673456));
     CALL_SUBTEST(check_in_range(-24345,24345));
-    CALL_SUBTEST(check_in_range(-6734565664234,6734565664234));
+    CALL_SUBTEST(check_in_range(-long_ref,long_ref));
   }
 
-  char char_offset = (std::min)(g_repeat,64);
   CALL_SUBTEST(check_all_in_range(11,11));
   CALL_SUBTEST(check_all_in_range(11,11+char_offset));
   CALL_SUBTEST(check_all_in_range(-5,5));
   CALL_SUBTEST(check_all_in_range(-11-char_offset,-11));
   CALL_SUBTEST(check_all_in_range(126-char_offset,126));
   CALL_SUBTEST(check_all_in_range(-126,126));
 
-  char short_offset = (std::min)(g_repeat,16000);
   CALL_SUBTEST(check_all_in_range(11,11));
   CALL_SUBTEST(check_all_in_range(11,11+short_offset));
   CALL_SUBTEST(check_all_in_range(-5,5));
   CALL_SUBTEST(check_all_in_range(-24345,-24345+short_offset));
   CALL_SUBTEST(check_all_in_range(24345,24345+short_offset));
 
-  CALL_SUBTEST(check_all_in_range(11,11));
   CALL_SUBTEST(check_all_in_range(11,11+g_repeat));
   CALL_SUBTEST(check_all_in_range(-5,5));
@@ -83,6 +83,6 @@ void test_rand()
   CALL_SUBTEST(check_all_in_range(11,11+g_repeat));
   CALL_SUBTEST(check_all_in_range(-5,5));
   CALL_SUBTEST(check_all_in_range(-11-g_repeat,-11));
-  CALL_SUBTEST(check_all_in_range(-6734565664234,-6734565664234+g_repeat));
-  CALL_SUBTEST(check_all_in_range(6734565664234,6734565664234+g_repeat));
+  CALL_SUBTEST(check_all_in_range(-long_ref,-long_ref+g_repeat));
+  CALL_SUBTEST(check_all_in_range( long_ref, long_ref+g_repeat));
 }
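[Editor's note] The point of the patch above is that a hard-coded 13-digit literal overflows a 32-bit long, so the test now derives its extreme value from the type's own limits (NumTraits<long>::highest()). A rough standalone sketch of the same idea, using std::numeric_limits instead of Eigen's NumTraits:

#include <iostream>
#include <limits>

int main() {
  // A literal such as 6734565664234 does not fit in a 32-bit 'long';
  // deriving the bound from the type works on both 32-bit and 64-bit ABIs.
  long long_ref = std::numeric_limits<long>::max() / 10;
  std::cout << "sizeof(long) = " << sizeof(long) << " bytes\n";
  std::cout << "testing range [" << -long_ref << ", " << long_ref << "]\n";
  return 0;
}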
From e134226a0352b0951b03e8ea56ce3bbd538e73db Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 19 Mar 2015 23:11:42 -0700
Subject: [PATCH 78/83] Fixed a bug in the handling of packets by the MeanReducer

---
 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 38586d067..25f085a59 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -77,7 +77,7 @@ template struct MeanReducer
   }
   template
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits::size);
+    return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size);
   }
 
   protected:

From a6a628ca6b3c0d0dd6716d200ba8e7740847168a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 19 Mar 2015 23:22:19 -0700
Subject: [PATCH 79/83] Added the -= operator to the device classes

---
 .../Eigen/CXX11/src/Tensor/TensorDevice.h | 39 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 649bdb308..7a67c56b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -21,8 +21,7 @@ namespace Eigen {
  * Example:
  *    C.device(EIGEN_GPU) = A + B;
  *
- * Todo: thread pools.
- * Todo: operator +=, -=, *= and so on.
+ * Todo: operator *= and /=.
  */
 template class TensorDevice {
@@ -50,6 +49,18 @@ template class TensorDevice {
     return *this;
   }
 
+  template
+  EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+    typedef typename OtherDerived::Scalar Scalar;
+    typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference;
+    Difference difference(m_expression, other);
+    typedef TensorAssignOp Assign;
+    Assign assign(m_expression, difference);
+    static const bool Vectorize = TensorEvaluator::PacketAccess;
+    internal::TensorExecutor::run(assign, m_device);
+    return *this;
+  }
+
  protected:
    const DeviceType& m_device;
    ExpressionType& m_expression;
@@ -82,6 +93,18 @@ template class TensorDevice
+  EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+    typedef typename OtherDerived::Scalar Scalar;
+    typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference;
+    Difference difference(m_expression, other);
+    typedef TensorAssignOp Assign;
+    Assign assign(m_expression, difference);
+    static const bool Vectorize = TensorEvaluator::PacketAccess;
+    internal::TensorExecutor::run(assign, m_device);
+    return *this;
+  }
+
  protected:
    const ThreadPoolDevice& m_device;
    ExpressionType& m_expression;
@@ -114,6 +137,18 @@ template class TensorDevice
     return *this;
   }
 
+  template
+  EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+    typedef typename OtherDerived::Scalar Scalar;
+    typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference;
+    Difference difference(m_expression, other);
+    typedef TensorAssignOp Assign;
+    Assign assign(m_expression, difference);
+    static const bool Vectorize = TensorEvaluator::PacketAccess;
+    internal::TensorExecutor::run(assign, m_device);
+    return *this;
+  }
+
  protected:
    const GpuDevice& m_device;
    ExpressionType m_expression;
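[Editor's note] With the additions above, a device-bound tensor expression can be updated in place with -= just as with = and +=. A hedged usage sketch; the tensor names are illustrative and the DefaultDevice host device is assumed to be available in this version of the Tensor module:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> A(32, 32), B(32, 32), C(32, 32);
  A.setRandom();
  B.setRandom();

  Eigen::DefaultDevice dev;

  C.device(dev) = A + B;   // device-bound assignment, already supported
  C.device(dev) += A;      // operator+= predates this patch
  C.device(dev) -= B;      // the new operator-=: evaluates C = C - B on 'dev'

  return 0;
}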
From 8d9bfb3a7ba03a8eb12a71d7133217d768f7940a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 20 Mar 2015 16:00:10 +0100
Subject: [PATCH 80/83] fix loadMarket wrt Index versus int

---
 unsupported/Eigen/src/SparseExtra/MarketIO.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 25ff4228d..100e617b2 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -18,7 +18,7 @@ namespace Eigen {
 
 namespace internal
 {
   template
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, Scalar& value)
+  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, Scalar& value)
   {
     line >> i >> j >> value;
     i--;
@@ -31,7 +31,7 @@ namespace internal
     return false;
   }
   template
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, std::complex& value)
+  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, std::complex& value)
   {
     Scalar valR, valI;
     line >> i >> j >> valR >> valI;

From 4e2b18d909f27f03f64c5fd8891f2e94c5e03802 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 20 Mar 2015 16:33:48 +0100
Subject: [PATCH 81/83] Update approx. minimum ordering method to push and keep
 structural empty diagonal elements to the bottom-right part of the matrix

---
 Eigen/src/OrderingMethods/Amd.h | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/Eigen/src/OrderingMethods/Amd.h b/Eigen/src/OrderingMethods/Amd.h
index 3d2981f0c..63d996cb4 100644
--- a/Eigen/src/OrderingMethods/Amd.h
+++ b/Eigen/src/OrderingMethods/Amd.h
@@ -137,22 +137,27 @@ void minimum_degree_ordering(SparseMatrix& C, Perm
     degree[i] = len[i];                      // degree of node i
   }
   mark = internal::cs_wclear(0, 0, w, n);    /* clear w */
-  elen[n] = -2;                              /* n is a dead element */
-  Cp[n] = -1;                                /* n is a root of assembly tree */
-  w[n] = 0;                                  /* n is a dead element */
 
   /* --- Initialize degree lists ------------------------------------------ */
   for(i = 0; i < n; i++)
   {
+    bool has_diag = false;
+    for(p = Cp[i]; p dense)                 /* node i is dense */
+    else if(d > dense || !has_diag)          /* node i is dense or has no structural diagonal element */
     {
       nv[i] = 0;                             /* absorb i into element n */
       elen[i] = -1;                          /* node i is dead */
@@ -168,6 +173,10 @@ void minimum_degree_ordering(SparseMatrix& C, Perm
     }
   }
 
+  elen[n] = -2;                              /* n is a dead element */
+  Cp[n] = -1;                                /* n is a root of assembly tree */
+  w[n] = 0;                                  /* n is a dead element */
+
   while (nel < n)                            /* while (selecting pivots) do */
   {
     /* --- Select node of minimum approximate degree -------------------- */

From 4472f3e57884202b52c551da0b294c2883e2ccbf Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Mon, 23 Mar 2015 09:40:21 +0100
Subject: [PATCH 82/83] Avoid SVD: consider denormalized small numbers as zero
 when computing the rank of the matrix

---
 Eigen/src/SVD/SVDBase.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h
index b89393721..ad191085e 100644
--- a/Eigen/src/SVD/SVDBase.h
+++ b/Eigen/src/SVD/SVDBase.h
@@ -130,9 +130,10 @@ public:
   inline Index rank() const
   {
     using std::abs;
+    using std::max;
     eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
     if(m_singularValues.size()==0) return 0;
-    RealScalar premultiplied_threshold = m_singularValues.coeff(0) * threshold();
+    RealScalar premultiplied_threshold = (max)(m_singularValues.coeff(0) * threshold(), (std::numeric_limits::min)());
     Index i = m_nonzeroSingularValues-1;
     while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
     return i+1;
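[Editor's note] The rank() change above clamps the singular-value cutoff so it can never drop below the smallest normalized positive value, which keeps denormalized singular values from being counted as nonzero. A small standalone illustration of that thresholding rule; it is not Eigen's implementation, just the same arithmetic on a plain vector:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

// Rank = number of singular values above a cutoff that is relative to the
// largest singular value but never below the smallest normalized double,
// so denormalized values are treated as zero.
std::size_t rank_from_singular_values(const std::vector<double>& sv, double rel_threshold)
{
  if (sv.empty()) return 0;
  double cutoff = (std::max)(sv.front() * rel_threshold,
                             (std::numeric_limits<double>::min)());
  std::size_t r = 0;
  while (r < sv.size() && sv[r] >= cutoff) ++r;
  return r;
}

int main() {
  // Singular values sorted in decreasing order; 1e-310 is denormal.
  std::vector<double> sv;
  sv.push_back(1e-300);
  sv.push_back(1e-310);
  // Without the clamp the cutoff would be 1e-312 and both values would count;
  // with it the denormal is rejected and the computed rank is 1.
  std::cout << rank_from_singular_values(sv, 1e-12) << "\n";
  return 0;
}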
From abdbe8562e889a0ca0877d607cfd5c4cbf937e3a Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 24 Mar 2015 10:45:46 -0700
Subject: [PATCH 83/83] Fixed the CUDA packet primitives

---
 Eigen/src/Core/arch/CUDA/PacketMath.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 19749c832..ceed1d1ef 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -197,21 +197,21 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(cons
 }
 #endif
 
-template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) {
   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
 }
-template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) {
   return make_double2(from[0*stride], from[1*stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
   to[stride*2] = from.z;
   to[stride*3] = from.w;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, int stride) {
+template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) {
   to[stride*0] = from.x;
   to[stride*1] = from.y;
 }
@@ -245,14 +245,14 @@ template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a)
 }
 
 template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) {
-  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
}
 template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) {
-  return make_double2(abs(a.x), abs(a.y));
+  return make_double2(fabs(a.x), fabs(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
@@ -279,7 +279,7 @@ ptranspose(PacketBlock& kernel) {
   kernel.packet[3].z = tmp;
 }
 
-template<> EIGEN_DEVICE_FUNC inline void
+EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
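[Editor's note] For reference, the strided gather/scatter primitives that the last patch retouches have simple semantics. The sketch below mimics them on the host with plain arrays; it is a toy stand-in rather than Eigen's CUDA code, and Index is assumed here to be a signed integer type such as std::ptrdiff_t.

#include <cstddef>
#include <iostream>

typedef std::ptrdiff_t Index;

struct float4 { float x, y, z, w; };

// Gather 4 floats spaced 'stride' apart into one packet.
float4 pgather4(const float* from, Index stride) {
  float4 p;
  p.x = from[0 * stride];
  p.y = from[1 * stride];
  p.z = from[2 * stride];
  p.w = from[3 * stride];
  return p;
}

// Scatter the packet back out with the same spacing.
void pscatter4(float* to, const float4& from, Index stride) {
  to[stride * 0] = from.x;
  to[stride * 1] = from.y;
  to[stride * 2] = from.z;
  to[stride * 3] = from.w;
}

int main() {
  float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float dst[8] = {0};
  float4 p = pgather4(src, 2);            // picks 0, 2, 4, 6
  pscatter4(dst, p, 2);
  std::cout << dst[0] << " " << dst[2] << " "
            << dst[4] << " " << dst[6] << "\n";  // prints 0 2 4 6
  return 0;
}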