diff --git a/CMakeLists.txt b/CMakeLists.txt index a4731c8c1..a719e47fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,18 @@ if(NOT MSVC) message(STATUS "Enabling SSE4.2 in tests/examples") endif() + option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF) + if(EIGEN_TEST_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + message(STATUS "Enabling AVX in tests/examples") + endif() + + option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF) + if(EIGEN_TEST_FMA) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma") + message(STATUS "Enabling FMA in tests/examples") + endif() + option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF) if(EIGEN_TEST_ALTIVEC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec") @@ -437,12 +449,12 @@ set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} ) set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} ) set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} ) -configure_file ( ${CMAKE_SOURCE_DIR}/cmake/Eigen3Config.cmake.in +configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake @ONLY ESCAPE_QUOTES ) -install ( FILES ${CMAKE_SOURCE_DIR}/cmake/UseEigen3.cmake +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake DESTINATION ${EIGEN_CONFIG_CMAKE_PATH} ) diff --git a/Eigen/Cholesky b/Eigen/Cholesky index f727f5d89..7314d326c 100644 --- a/Eigen/Cholesky +++ b/Eigen/Cholesky @@ -10,9 +10,11 @@ * * * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices. - * Those decompositions are accessible via the following MatrixBase methods: - * - MatrixBase::llt(), + * Those decompositions are also accessible via the following methods: + * - MatrixBase::llt() * - MatrixBase::ldlt() + * - SelfAdjointView::llt() + * - SelfAdjointView::ldlt() * * \code * #include diff --git a/Eigen/Core b/Eigen/Core index 6d5571875..82dc30c62 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -117,7 +117,16 @@ #ifdef __SSE4_2__ #define EIGEN_VECTORIZE_SSE4_2 #endif - + #ifdef __AVX__ + #define EIGEN_VECTORIZE_AVX + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __FMA__ + #define EIGEN_VECTORIZE_FMA + #endif // include files // This extern "C" works around a MINGW-w64 compilation issue @@ -130,7 +139,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. 
However old gcc versions seems to not have this file, thus: - #ifdef __INTEL_COMPILER + #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110 #include #else #include @@ -147,6 +156,9 @@ #ifdef EIGEN_VECTORIZE_SSE4_2 #include #endif + #ifdef EIGEN_VECTORIZE_AVX + #include + #endif #endif } // end extern "C" #elif defined __ALTIVEC__ @@ -216,7 +228,9 @@ namespace Eigen { inline static const char *SimdInstructionSetsInUse(void) { -#if defined(EIGEN_VECTORIZE_SSE4_2) +#if defined(EIGEN_VECTORIZE_AVX) + return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; #elif defined(EIGEN_VECTORIZE_SSE4_1) return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; @@ -294,7 +308,13 @@ using std::ptrdiff_t; #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" -#if defined EIGEN_VECTORIZE_SSE +#if defined EIGEN_VECTORIZE_AVX + // Use AVX for floats and doubles, SSE for integers + #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/Complex.h" + #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/Complex.h" +#elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/Complex.h" diff --git a/Eigen/Geometry b/Eigen/Geometry index efd9d4504..f9bc6fc57 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -47,7 +47,9 @@ #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" - #if defined EIGEN_VECTORIZE_SSE + // Use the SSE optimized version whenever possible. At the moment the + // SSE version doesn't compile when AVX is enabled + #if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX #include "src/Geometry/arch/Geometry_SSE.h" #endif #endif diff --git a/Eigen/LU b/Eigen/LU index fac80024f..498faee10 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -27,7 +27,9 @@ #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" -#if defined EIGEN_VECTORIZE_SSE +// Use the SSE optimized version whenever possible. At the moment the +// SSE version doesn't compile when AVX is enabled +#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX #include "src/LU/arch/Inverse_SSE.h" #endif diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index d66110821..11dd5624c 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -43,7 +43,7 @@ namespace internal { * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky * decomposition to determine whether a system of equations has a solution. * - * \sa MatrixBase::ldlt(), class LLT + * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template class LDLT { @@ -179,7 +179,7 @@ template class LDLT * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular. 
* - * \sa MatrixBase::ldlt() + * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt() */ #ifdef EIGEN_TEST_EVALUATORS template @@ -610,6 +610,7 @@ MatrixType LDLT::reconstructedMatrix() const #ifndef __CUDACC__ /** \cholesky_module * \returns the Cholesky decomposition with full pivoting without square root of \c *this + * \sa MatrixBase::ldlt() */ template inline const LDLT::PlainObject, UpLo> @@ -620,6 +621,7 @@ SelfAdjointView::ldlt() const /** \cholesky_module * \returns the Cholesky decomposition with full pivoting without square root of \c *this + * \sa SelfAdjointView::ldlt() */ template inline const LDLT::PlainObject> diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index d9a8ef1fb..083b820f5 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -41,7 +41,7 @@ template struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * - * \sa MatrixBase::llt(), class LDLT + * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) * Note that during the decomposition, only the upper triangular part of A is considered. Therefore, @@ -115,7 +115,7 @@ template class LLT * Example: \include LLT_solve.cpp * Output: \verbinclude LLT_solve.out * - * \sa solveInPlace(), MatrixBase::llt() + * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt() */ #ifdef EIGEN_TEST_EVALUATORS template @@ -498,6 +498,7 @@ MatrixType LLT::reconstructedMatrix() const #ifndef __CUDACC__ /** \cholesky_module * \returns the LLT decomposition of \c *this + * \sa SelfAdjointView::llt() */ template inline const LLT::PlainObject> @@ -508,6 +509,7 @@ MatrixBase::llt() const /** \cholesky_module * \returns the LLT decomposition of \c *this + * \sa SelfAdjointView::llt() */ template inline const LLT::PlainObject, UpLo> diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 0e10f125f..abbba84c3 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -105,6 +105,8 @@ public: EIGEN_DEBUG_VAR(DstIsAligned) EIGEN_DEBUG_VAR(SrcIsAligned) EIGEN_DEBUG_VAR(JointAlignment) + EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime) + EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost) EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(PacketSize) diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h index 7772951b9..97134ffd7 100644 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h @@ -202,6 +202,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan) +EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan) //EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp) EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index ef6c143d3..70e457028 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -88,8 +88,8 @@ struct traits > : traits::size) == 0) && (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, - MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0, - FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? 
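 * Illustrative usage (not part of this patch), showing the MatrixBase entry point and the
 * SelfAdjointView entry point referenced in the \sa list side by side; the names M, A and b
 * are placeholders and A is assumed symmetric positive definite:
 * \code
 * Eigen::MatrixXd M = Eigen::MatrixXd::Random(4,4);
 * Eigen::MatrixXd A = M * M.transpose() + 4 * Eigen::MatrixXd::Identity(4,4); // SPD matrix
 * Eigen::VectorXd b = Eigen::VectorXd::Random(4);
 * Eigen::VectorXd x1 = A.ldlt().solve(b);                                 // MatrixBase::ldlt()
 * Eigen::VectorXd x2 = A.selfadjointView<Eigen::Lower>().ldlt().solve(b); // SelfAdjointView::ldlt(), reads only the lower half of A
 * \endcode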
LinearAccessBit : 0, + MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, Flags0 = traits::Flags & ( (HereditaryBits & ~RowMajorBit) | diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index b9dd75ade..59f515495 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -40,7 +40,7 @@ void check_static_allocation_size() */ template struct plain_array { @@ -68,27 +68,27 @@ struct plain_array template EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; } #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ - eigen_assert((reinterpret_cast(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \ + eigen_assert((reinterpret_cast(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! ****"); #else #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ - eigen_assert((reinterpret_cast(array) & sizemask) == 0 \ + eigen_assert((reinterpret_cast(array) & (sizemask)) == 0 \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! ****"); #endif template -struct plain_array +struct plain_array { - EIGEN_USER_ALIGN16 T array[Size]; + EIGEN_USER_ALIGN_DEFAULT T array[Size]; EIGEN_DEVICE_FUNC plain_array() { - EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf); + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1); check_static_allocation_size(); } @@ -102,7 +102,7 @@ struct plain_array template struct plain_array { - EIGEN_USER_ALIGN16 T array[1]; + EIGEN_USER_ALIGN_DEFAULT T array[1]; EIGEN_DEVICE_FUNC plain_array() {} EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index f28d2e46c..734815986 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -69,8 +69,7 @@ template struct product_type MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime, _Rhs::MaxRowsAtCompileTime), Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, - _Rhs::RowsAtCompileTime), - LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD + _Rhs::RowsAtCompileTime) }; // the splitting into different lines of code here, introducing the _select enums and the typedef below, @@ -424,7 +423,7 @@ struct gemv_static_vector_if internal::plain_array m_data; EIGEN_STRONG_INLINE Scalar* data() { return ForceAlignment - ? reinterpret_cast((reinterpret_cast(m_data.array) & ~(size_t(15))) + 16) + ? 
reinterpret_cast((reinterpret_cast(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES) : m_data.array; } #endif diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h old mode 100644 new mode 100755 index b0469fa1e..4523d2263 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -42,6 +42,8 @@ namespace internal { struct default_packet_traits { enum { + HasHalfPacket = 0, + HasAdd = 1, HasSub = 1, HasMul = 1, @@ -71,10 +73,12 @@ struct default_packet_traits template struct packet_traits : default_packet_traits { typedef T type; + typedef T half; enum { Vectorizable = 0, size = 1, - AlignedOnScalar = 0 + AlignedOnScalar = 0, + HasHalfPacket = 0 }; enum { HasAdd = 0, @@ -157,18 +161,66 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } -/** \internal \returns a packet with elements of \a *from duplicated. - * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and - * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]} - * Currently, this function is only used for scalar * complex products. - */ -template EIGEN_DEVICE_FUNC inline Packet -ploaddup(const typename unpacket_traits::type* from) { return *from; } - /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ +template EIGEN_DEVICE_FUNC inline Packet +pload1(const typename unpacket_traits::type *a) { return pset1(*a); } + +/** \internal \returns a packet with elements of \a *from duplicated. + * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and + * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} + * Currently, this function is only used for scalar * complex products. + */ +template EIGEN_DEVICE_FUNC inline Packet +ploaddup(const typename unpacket_traits::type* from) { return *from; } + +/** \internal \returns a packet with elements of \a *from quadrupled. + * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and + * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]} + * Currently, this function is only used in matrix products. 
+ * For packet-size smaller or equal to 4, this function is equivalent to pload1 + */ +template EIGEN_DEVICE_FUNC inline Packet +ploadquad(const typename unpacket_traits::type* from) +{ return pload1(from); } + +/** \internal equivalent to + * \code + * a0 = pload1(a+0); + * a1 = pload1(a+1); + * a2 = pload1(a+2); + * a3 = pload1(a+3); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast2 + */ +template EIGEN_DEVICE_FUNC +inline void pbroadcast4(const typename unpacket_traits::type *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pload1(a+0); + a1 = pload1(a+1); + a2 = pload1(a+2); + a3 = pload1(a+3); +} + +/** \internal equivalent to + * \code + * a0 = pload1(a+0); + * a1 = pload1(a+1); + * \endcode + * \sa pset1, pload1, ploaddup, pbroadcast4 + */ +template EIGEN_DEVICE_FUNC +inline void pbroadcast2(const typename unpacket_traits::type *a, + Packet& a0, Packet& a1) +{ + a0 = pload1(a+0); + a1 = pload1(a+1); +} + /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template inline typename packet_traits::type plset(const Scalar& a) { return a; } @@ -179,7 +231,13 @@ template EIGEN_DEVICE_FUNC inline void pstore( /** \internal copy the packet \a from to \a *to, (un-aligned store) */ template EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) -{ (*to) = from; } +{ (*to) = from; } + + template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, int /*stride*/) + { return ploadu(from); } + + template EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/) + { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ template inline void prefetch(const Scalar* addr) @@ -201,6 +259,15 @@ preduxp(const Packet* vecs) { return vecs[0]; } template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) { return a; } +/** \internal \returns the sum of the elements of \a a by block of 4 elements. + * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} + * For packet-size smaller or equal to 4, this boils down to a noop. + */ +template EIGEN_DEVICE_FUNC inline +typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type +predux4(const Packet& a) +{ return a; } + /** \internal \returns the product of the elements of \a a*/ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) { return a; } @@ -251,6 +318,10 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos(const Packet& a) { using std::acos; return acos(a); } +/** \internal \returns the atan of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet patan(const Packet& a) { using std::atan; return atan(a); } + /** \internal \returns the exp of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet& a) { using std::exp; return exp(a); } @@ -348,9 +419,22 @@ template<> inline std::complex pmul(const std::complex& a, const #endif + +/*************************************************************************** + * PacketBlock, that is a collection of N packets where the number of words + * in the packet is a multiple of N. 
+***************************************************************************/ +template ::size> struct PacketBlock { + Packet packet[N]; +}; + +template EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& /*kernel*/) { + // Nothing to do in the scalar case, i.e. a 1x1 matrix. +} + } // end namespace internal } // end namespace Eigen #endif // EIGEN_GENERIC_PACKET_MATH_H - diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 2acf97723..2067a2a6e 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -45,6 +45,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 23bbb46bf..6b2d61909 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -89,7 +89,7 @@ struct traits > && ( bool(IsDynamicSize) || HasNoOuterStride || ( OuterStrideAtCompileTime!=Dynamic - && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ), + && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ), Flags0 = TraitsBase::Flags & (~NestByRefBit), Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit), Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime)) diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index de1424b09..927a59c50 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -166,11 +166,10 @@ template class MapBase EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits::Flags&PacketAccessBit, internal::inner_stride_at_compile_time::ret==1), PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1); - eigen_assert(EIGEN_IMPLIES(internal::traits::Flags&AlignedBit, (size_t(m_data) % 16) == 0) && "data is not aligned"); + eigen_assert(EIGEN_IMPLIES(internal::traits::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); #else - eigen_assert(EIGEN_IMPLIES(internal::traits::IsAligned, (size_t(m_data) % 16) == 0) && "data is not aligned"); + eigen_assert(EIGEN_IMPLIES(internal::traits::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); #endif - } PointerType m_data; @@ -255,6 +254,8 @@ template class MapBase using Base::Base::operator=; }; +#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS + } // end namespace Eigen #endif // EIGEN_MAPBASE_H diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 63fb92b75..20fc2be74 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -669,6 +669,15 @@ bool (isfinite)(const T& x) return x::highest() && x>NumTraits::lowest(); } +template +EIGEN_DEVICE_FUNC +bool (isfinite)(const std::complex& x) +{ + using std::real; + using std::imag; + return isfinite(real(x)) && isfinite(imag(x)); +} + } // end namespace numext namespace internal { diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 1e2270d34..136cd165e 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -325,10 +325,16 @@ struct redux_impl }; static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) { - Scalar res = func.predux(redux_vec_unroller::run(mat,func)); - if (VectorizedSize != 
Size) - res = func(res,redux_novec_unroller::run(mat,func)); - return res; + eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + if (VectorizedSize > 0) { + Scalar res = func.predux(redux_vec_unroller::run(mat,func)); + if (VectorizedSize != Size) + res = func(res,redux_novec_unroller::run(mat,func)); + return res; + } + else { + return redux_novec_unroller::run(mat,func); + } } }; diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 665b7497a..fe493641d 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -97,7 +97,7 @@ struct traits > template struct match { enum { HasDirectAccess = internal::has_direct_access::ret, - StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic) || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime) || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1), @@ -168,8 +168,12 @@ protected: } else ::new (static_cast(this)) Base(expr.data(), expr.rows(), expr.cols()); - ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), - StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); + + if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit))) + ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1); + else + ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), + StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); } StrideBase m_stride; diff --git a/Eigen/src/Core/arch/AVX/CMakeLists.txt b/Eigen/src/Core/arch/AVX/CMakeLists.txt new file mode 100644 index 000000000..bdb71ab99 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_AVX_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel +) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h new file mode 100644 index 000000000..9ced85132 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -0,0 +1,463 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
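// Illustrative sketch (not part of this patch): Packet4cf packs four std::complex<float>
// into one __m256 in interleaved order {re0,im0, re1,im1, re2,im2, re3,im3}. The
// pmul<Packet4cf> defined below builds the complex product from three AVX operations:
//   tmp1 = moveldup(a) * b               -> {ar*br, ar*bi, ...}
//   tmp2 = movehdup(a) * swap_re_im(b)   -> {ai*bi, ai*br, ...}
//   addsub(tmp1, tmp2)                   -> {ar*br - ai*bi, ar*bi + ai*br, ...}
// A per-lane scalar reference model of that identity (ar/ai/br/bi are placeholder names):
inline void cmul_lane_reference(float ar, float ai, float br, float bi,
                                float& re, float& im)
{
  re = ar*br - ai*bi;   // even slot: addsub subtracts
  im = ar*bi + ai*br;   // odd slot:  addsub adds
}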
+ +#ifndef EIGEN_COMPLEX_AVX_H +#define EIGEN_COMPLEX_AVX_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet4cf +{ + EIGEN_STRONG_INLINE Packet4cf() {} + EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {} + __m256 v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet4cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { typedef std::complex type; enum {size=4}; typedef Packet2cf half; }; + +template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) +{ + return Packet4cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) +{ + const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet4cf(_mm256_xor_ps(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) +{ + __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v); + __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + __m256 result = _mm256_addsub_ps(tmp1, tmp2); + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet4cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) +{ + return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) +{ + // FIXME The following might be optimized using _mm256_movedup_pd + Packet2cf a = ploaddup(from); + Packet2cf b = ploaddup(from+1); + return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather, Packet4cf>(const std::complex* from, int stride) +{ + return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), 
std::real(from[3*stride]), + std::imag(from[2*stride]), std::real(from[2*stride]), + std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4cf>(std::complex* to, const Packet4cf& from, int stride) +{ + __m128 low = _mm256_extractf128_ps(from.v, 0); + to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1))); + to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3))); + + __m128 high = _mm256_extractf128_ps(from.v, 1); + to[stride*2] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1))); + to[stride*3] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3))); + +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cf& a) +{ + return pfirst(Packet2cf(_mm256_castps256_ps128(a.v))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { + __m128 low = _mm256_extractf128_ps(a.v, 0); + __m128 high = _mm256_extractf128_ps(a.v, 1); + __m128d lowd = _mm_castps_pd(low); + __m128d highd = _mm_castps_pd(high); + low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1)); + high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1)); + __m256 result = _mm256_setzero_ps(); + result = _mm256_insertf128_ps(result, low, 1); + result = _mm256_insertf128_ps(result, high, 0); + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cf& a) +{ + return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)), + Packet2cf(_mm256_extractf128_ps(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs) +{ + Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t0 = _mm256_hadd_ps(t0,t1); + Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t2 = _mm256_hadd_ps(t2,t3); + + t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); + t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); + + return Packet4cf(_mm256_add_ps(t1,t3)); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) +{ + return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), + Packet2cf(_mm256_extractf128_ps(a.v, 1)))); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); 
} + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const + { return Packet4cf(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const + { return Packet4cf(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) +{ + Packet4cf num = pmul(a, pconj(b)); + __m256 tmp = _mm256_mul_ps(b.v, b.v); + __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1); + __m256 denom = _mm256_add_ps(tmp, tmp2); + return Packet4cf(_mm256_div_ps(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& x) +{ + return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); +} + +//---------- double ---------- +struct Packet2cd +{ + EIGEN_STRONG_INLINE Packet2cd() {} + EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {} + __m256d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet2cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 2, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) +{ + const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet2cd(_mm256_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) +{ + __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0); + __m256d even = _mm256_mul_pd(tmp1, b.v); + __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF); + __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5); + __m256d odd = _mm256_mul_pd(tmp2, tmp3); + return Packet2cd(_mm256_addsub_pd(even, odd)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } 
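// Illustrative sketch (not part of this patch): semantics of the strided pgather / pscatter
// specializations defined in this file. For a packet of N coefficients, pgather reads
// from[0], from[stride], ..., from[(N-1)*stride], and pscatter writes the packet back with
// the same stride. A plain reference model (Scalar and N are placeholders):
template<typename Scalar, int N>
inline void gather_reference(const Scalar* from, int stride, Scalar (&out)[N])
{
  for (int i = 0; i < N; ++i) out[i] = from[i*stride];   // strided load
}
template<typename Scalar, int N>
inline void scatter_reference(Scalar* to, const Scalar (&in)[N], int stride)
{
  for (int i = 0; i < N; ++i) to[i*stride] = in[i];      // strided store
}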
+template<> EIGEN_STRONG_INLINE Packet2cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pset1(const std::complex& from) +{ + // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though) +// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from)); + return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather, Packet2cd>(const std::complex* from, int stride) +{ + return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cd>(std::complex* to, const Packet2cd& from, int stride) +{ + __m128d low = _mm256_extractf128_pd(from.v, 0); + to[stride*0] = std::complex(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1))); + __m128d high = _mm256_extractf128_pd(from.v, 1); + to[stride*1] = std::complex(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1))); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cd& a) +{ + __m128d low = _mm256_extractf128_pd(a.v, 0); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex(res[0],res[1]); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { + __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1); + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cd& a) +{ + return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)), + Packet1cd(_mm256_extractf128_pd(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preduxp(const Packet2cd* vecs) +{ + Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); + Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); + + return Packet2cd(_mm256_add_pd(t0,t1)); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd& a) +{ + return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), + Packet1cd(_mm256_extractf128_pd(a.v,1)))); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, 
const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const + { return Packet2cd(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const + { return Packet2cd(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) +{ + Packet2cd num = pmul(a, pconj(b)); + __m256d tmp = _mm256_mul_pd(b.v, b.v); + __m256d denom = _mm256_hadd_pd(tmp, tmp); + return Packet2cd(_mm256_div_pd(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x) +{ + return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256d P0 = _mm256_castps_pd(kernel.packet[0].v); + __m256d P1 = _mm256_castps_pd(kernel.packet[1].v); + __m256d P2 = _mm256_castps_pd(kernel.packet[2].v); + __m256d P3 = _mm256_castps_pd(kernel.packet[3].v); + + __m256d T0 = _mm256_shuffle_pd(P0, P1, 15); + __m256d T1 = _mm256_shuffle_pd(P0, P1, 0); + __m256d T2 = _mm256_shuffle_pd(P2, P3, 15); + __m256d T3 = _mm256_shuffle_pd(P2, P3, 0); + + kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32)); + kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49)); + kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32)); + kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4)); + kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4)); + kernel.packet[0].v = tmp; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX_H diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h new file mode 100644 index 000000000..8b8307d75 --- /dev/null +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -0,0 +1,564 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
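// Illustrative sketch (not part of this patch): with AVX, Packet8f carries eight floats and
// Packet4d four doubles, and each advertises a 128-bit "half" packet (Packet4f / Packet2d)
// through the new HasHalfPacket flag and the half typedefs in packet_traits/unpacket_traits.
// predux4, defined further down, folds a Packet8f into its Packet4f half by adding the two
// 128-bit lanes. A scalar reference model of that reduction:
inline void predux4_reference(const float in[8], float out[4])
{
  for (int i = 0; i < 4; ++i)
    out[i] = in[i] + in[i + 4];   // {a0+a4, a1+a5, a2+a6, a3+a7}
}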
+ +#ifndef EIGEN_PACKET_MATH_AVX_H +#define EIGEN_PACKET_MATH_AVX_H + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif + +#ifdef EIGEN_VECTORIZE_FMA +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif +#endif + +typedef __m256 Packet8f; +typedef __m256i Packet8i; +typedef __m256d Packet4d; + +template<> struct is_arithmetic<__m256> { enum { value = true }; }; +template<> struct is_arithmetic<__m256i> { enum { value = true }; }; +template<> struct is_arithmetic<__m256d> { enum { value = true }; }; + +#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ + const Packet8f p8f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ + const Packet4d p4d_##NAME = pset1(X) + + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet8f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8, + HasHalfPacket = 1, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; + }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 1, + + HasDiv = 1, + HasExp = 0 + }; +}; + +/* Proper support for integers is only provided by AVX2. In the meantime, we'll + use SSE instructions and packets to deal with integers. +template<> struct packet_traits : default_packet_traits +{ + typedef Packet8i type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8 + }; +}; +*/ + +template<> struct unpacket_traits { typedef float type; typedef Packet4f half; enum {size=8}; }; +template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4}; }; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8}; }; + +template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } + +template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } +template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } + +template<> EIGEN_STRONG_INLINE Packet8f plset(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } + +template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) +{ + return _mm256_sub_ps(_mm256_set1_ps(0.0),a); +} +template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) +{ + return 
_mm256_sub_pd(_mm256_set1_pd(0.0),a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8f pmul(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmul(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } + + +template<> EIGEN_STRONG_INLINE Packet8f pdiv(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pdiv(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, const Packet8i& /*b*/) +{ eigen_assert(false && "packet integer division are not supported by AVX"); + return pset1(0); +} + +#ifdef EIGEN_VECTORIZE_FMA +template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { +#if defined(__clang__) || defined(__GNUC__) + // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, + // and gcc stupidly generates a vfmadd132ps instruction, + // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate + // the result of the product. + Packet8f res = c; + asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_ps(a,b,c); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { +#if defined(__clang__) || defined(__GNUC__) + // see above + Packet4d res = c; + asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_pd(a,b,c); +#endif +} +#endif + +template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return 
_mm256_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } + +// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} +template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) +{ + // TODO try to find a way to avoid the need of a temporary register +// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); +// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); +// return _mm256_unpacklo_ps(tmp,tmp); + + // _mm256_insertf128_ps is very slow on Haswell, thus: + Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); + // mimic an "inplace" permutation of the lower 128bits using a blend + tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); + // then we can perform a consistent permutation on the global register to get everything in shape: + return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); +} +// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} +template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from) +{ + Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); + return _mm256_permute_pd(tmp, 3<<2); +} + +// Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} +template<> EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) +{ + Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from)); + return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1); +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } + +// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available +// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); +template<> EIGEN_DEVICE_FUNC inline Packet8f pgather(const float* from, int stride) +{ + return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride], + from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4d pgather(const double* from, int stride) +{ + return _mm256_set_pd(from[3*stride], from[2*stride], 
from[1*stride], from[0*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet8f& from, int stride) +{ + __m128 low = _mm256_extractf128_ps(from, 0); + to[stride*0] = _mm_cvtss_f32(low); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)); + + __m128 high = _mm256_extractf128_ps(from, 1); + to[stride*4] = _mm_cvtss_f32(high); + to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)); + to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)); + to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet4d& from, int stride) +{ + __m128d low = _mm256_extractf128_pd(from, 0); + to[stride*0] = _mm_cvtsd_f64(low); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)); + __m128d high = _mm256_extractf128_pd(from, 1); + to[stride*2] = _mm_cvtsd_f64(high); + to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) +{ + Packet8f pa = pset1(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) +{ + Packet4d pa = pset1(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) +{ + Packet8i pa = pset1(a); + pstore(to, pa); +} + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { + return _mm_cvtss_f32(_mm256_castps256_ps128(a)); +} +template<> EIGEN_STRONG_INLINE double pfirst(const Packet4d& a) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); +} +template<> EIGEN_STRONG_INLINE int pfirst(const Packet8i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} + + +template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) +{ + __m256 tmp = _mm256_shuffle_ps(a,a,0x1b); + return _mm256_permute2f128_ps(tmp, tmp, 1); +} +template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) +{ + __m256d tmp = _mm256_shuffle_pd(a,a,5); + return _mm256_permute2f128_pd(tmp, tmp, 1); + + __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); + return _mm256_permute_pd(swap_halves,5); +} + +// pabs should be ok +template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) +{ + const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm256_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) +{ + const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm256_and_pd(a,mask); +} + +// preduxp should be ok +// FIXME: why is this ok? why isn't the simply implementation working as expected? 
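// Illustrative sketch (not part of this patch): preduxp takes eight Packet8f inputs and
// returns a Packet8f whose i-th coefficient is the sum of the eight coefficients of vecs[i];
// the hadd/permute/blend sequence below is one way to get there without an explicit 8x8
// transpose. A scalar reference model of the intended result:
inline void preduxp8_reference(const float vecs[8][8], float out[8])
{
  for (int i = 0; i < 8; ++i) {
    out[i] = 0.0f;
    for (int j = 0; j < 8; ++j)
      out[i] += vecs[i][j];       // horizontal sum of the i-th input packet
  }
}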
+template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) +{ + __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); + __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); + __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); + __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); + + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + return final; +} +template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) +{ + Packet4d tmp0, tmp1; + + tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + return _mm256_blend_pd(tmp0, tmp1, 0xC); +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) +{ + Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1)); + tmp0 = _mm256_hadd_ps(tmp0,tmp0); + return pfirst(_mm256_hadd_ps(tmp0, tmp0)); +} +template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) +{ + Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_hadd_pd(tmp0,tmp0)); +} + +template<> EIGEN_STRONG_INLINE Packet4f predux4(const Packet8f& a) +{ + return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); +} + +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) +{ + Packet8f tmp; + tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) +{ + Packet4d tmp; + tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); +} + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) +{ + Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) +{ + Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) +{ + Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} + +template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) +{ + Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); +} + + +template +struct palign_impl +{ + static 
EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) + { + if (Offset==1) + { + first = _mm256_blend_ps(first, second, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88); + } + else if (Offset==2) + { + first = _mm256_blend_ps(first, second, 3); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc); + } + else if (Offset==3) + { + first = _mm256_blend_ps(first, second, 7); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee); + } + else if (Offset==4) + { + first = _mm256_blend_ps(first, second, 15); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); + first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0)); + } + else if (Offset==5) + { + first = _mm256_blend_ps(first, second, 31); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0x88); + } + else if (Offset==6) + { + first = _mm256_blend_ps(first, second, 63); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xcc); + } + else if (Offset==7) + { + first = _mm256_blend_ps(first, second, 127); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xee); + } + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) + { + if (Offset==1) + { + first = _mm256_blend_pd(first, second, 1); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 0xA); + } + else if (Offset==2) + { + first = _mm256_blend_pd(first, second, 3); + first = _mm256_permute2f128_pd(first, first, 1); + } + else if (Offset==3) + { + first = _mm256_blend_pd(first, second, 7); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 5); + } + } +}; + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); + __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); + __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); + __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); + __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); + __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); + __m256 S7 = 
_mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); + kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); + kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); + kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); + kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); + kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + + kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); + kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); + __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); + __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); + __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); + + kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); + kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); + kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); + kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_AVX_H diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 68d9a2bff..5409ddedd 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -21,6 +21,8 @@ static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8); static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; //---------- float ---------- struct Packet2cf @@ -33,6 +35,7 @@ struct Packet2cf template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -51,7 +54,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; +template<> struct unpacket_traits { typedef 
std::complex type; enum {size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -65,6 +68,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return Packet2cf(vec_ld(0, (const float*)af)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + vec_st(from.v, 0, (float*)af); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + + template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } @@ -210,6 +229,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV)); } +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1); + kernel.packet[0].v = tmp; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 45d1954f7..0e9adf450 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,6 +18,10 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif + #ifndef EIGEN_HAS_FUSE_CJMADD #define EIGEN_HAS_FUSE_CJMADD 1 #endif @@ -73,6 +77,7 @@ static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui) template<> struct packet_traits : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -89,6 +94,7 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { // FIXME check the Has* Vectorizable = 1, @@ -97,8 +103,8 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4}; typedef Packet4i half; }; /* inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { @@ -144,6 +150,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) return s; } */ + template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html float EIGEN_ALIGN16 af[4]; @@ -161,6 +168,65 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vc; } + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = vec_ld(0,a); + a0 = vec_splat(a3, 0); + a1 = 
vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const int *a, + Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) +{ + a3 = vec_ld(0,a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return vec_ld(0, af); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, int stride) +{ + int EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return vec_ld(0, ai); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, int stride) +{ + float EIGEN_ALIGN16 af[4]; + vec_st(from, 0, af); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, int stride) +{ + int EIGEN_ALIGN16 ai[4]; + vec_st(from, 0, ai); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return vec_add(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return vec_add(pset1(a), p4i_COUNTDOWN); } @@ -494,6 +560,32 @@ struct palign_impl } }; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4i t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt index 8456dec15..0db8c558d 100644 --- a/Eigen/src/Core/arch/CMakeLists.txt +++ b/Eigen/src/Core/arch/CMakeLists.txt @@ -1,4 +1,5 @@ ADD_SUBDIRECTORY(SSE) ADD_SUBDIRECTORY(AltiVec) ADD_SUBDIRECTORY(NEON) +ADD_SUBDIRECTORY(AVX) ADD_SUBDIRECTORY(Default) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 8d9255eef..259f2e7b8 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -28,6 +28,7 @@ struct Packet2cf template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -46,7 +47,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; +template<> struct unpacket_traits { typedef std::complex type; enum 
{size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -110,6 +111,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) +{ + Packet4f res; + res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); + res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); + res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); + return Packet2cf(res); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) +{ + to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); + to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); +} + template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((float *)addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) @@ -246,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 05e891df2..e5eb06f36 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -66,6 +66,7 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -83,6 +84,7 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -95,12 +97,13 @@ template<> struct packet_traits : default_packet_traits // workaround gcc 4.2, 4.3 and 4.4 compilatin issue EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4}; typedef Packet4i half; }; template<> EIGEN_STRONG_INLINE Packet4f 
pset1(const float& from) { return vdupq_n_f32(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vdupq_n_s32(from); } @@ -219,6 +222,40 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) +{ + Packet4f res; + res = vsetq_lane_f32(from[0*stride], res, 0); + res = vsetq_lane_f32(from[1*stride], res, 1); + res = vsetq_lane_f32(from[2*stride], res, 2); + res = vsetq_lane_f32(from[3*stride], res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, int stride) +{ + Packet4i res; + res = vsetq_lane_s32(from[0*stride], res, 0); + res = vsetq_lane_s32(from[1*stride], res, 1); + res = vsetq_lane_s32(from[2*stride], res, 2); + res = vsetq_lane_s32(from[3*stride], res, 3); + return res; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, int stride) +{ + to[stride*0] = vgetq_lane_f32(from, 0); + to[stride*1] = vgetq_lane_f32(from, 1); + to[stride*2] = vgetq_lane_f32(from, 2); + to[stride*3] = vgetq_lane_f32(from, 3); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, int stride) +{ + to[stride*0] = vgetq_lane_s32(from, 0); + to[stride*1] = vgetq_lane_s32(from, 1); + to[stride*2] = vgetq_lane_s32(from, 2); + to[stride*3] = vgetq_lane_s32(from, 3); +} + template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ARM_PREFETCH(addr); } @@ -385,6 +422,7 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) a_lo = vget_low_s32(a); a_hi = vget_high_s32(a); max = vpmax_s32(a_lo, a_hi); + max = vpmax_s32(max, max); return vget_lane_s32(max, 0); } @@ -410,9 +448,30 @@ PALIGN_NEON(0,Packet4i,vextq_s32) PALIGN_NEON(1,Packet4i,vextq_s32) PALIGN_NEON(2,Packet4i,vextq_s32) PALIGN_NEON(3,Packet4i,vextq_s32) - + #undef PALIGN_NEON +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); + float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); + + kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); + kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); + kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); + kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); + int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); + kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); + kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); + kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 
91bba5e38..758183c18 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -22,13 +22,18 @@ struct Packet2cf __m128 v; }; +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. +#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -42,8 +47,9 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); } + + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, int stride) +{ + return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, int stride) +{ + to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); + to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); +} template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pack #endif } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { @@ -214,7 +235,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } + { return Packet2cf(Eigen::internal::pmul(x, y.v)); } }; template<> struct conj_helper @@ -223,7 +244,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const 
Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } + { return Packet2cf(Eigen::internal::pmul(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) @@ -248,13 +269,18 @@ struct Packet1cd __m128d v; }; +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. +#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; + typedef Packet1cd half; enum { Vectorizable = 1, AlignedOnScalar = 0, size = 1, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -268,12 +294,13 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1}; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); @@ -311,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } // FIXME force unaligned store, this is a temporary fix -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -410,7 +437,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } + { return Packet1cd(Eigen::internal::pmul(x, y.v)); } }; template<> struct conj_helper @@ -419,7 +446,7 @@ template<> struct conj_helper { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } + { return Packet1cd(Eigen::internal::pmul(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) @@ -432,7 +459,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) { - return Packet1cd(preverse(x.v)); + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128d w1 = 
_mm_castps_pd(kernel.packet[0].v); + __m128d w2 = _mm_castps_pd(kernel.packet[1].v); + + __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2)); + kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2)); + kernel.packet[1].v = tmp; } } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 09f74c651..8f78b3a6c 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -63,7 +63,7 @@ Packet4f plog(const Packet4f& _x) x = _mm_or_ps(x, p4f_half); emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1); + Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); /* part2: if( x < SQRTHF ) { @@ -72,9 +72,9 @@ Packet4f plog(const Packet4f& _x) } else { x = x - 1.0; } */ Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = _mm_and_ps(x, mask); + Packet4f tmp = pand(x, mask); x = psub(x, p4f_1); - e = psub(e, _mm_and_ps(p4f_1, mask)); + e = psub(e, pand(p4f_1, mask)); x = padd(x, tmp); Packet4f x2 = pmul(x,x); @@ -167,7 +167,7 @@ Packet4f pexp(const Packet4f& _x) emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, p4i_0x7f); emm0 = _mm_slli_epi32(emm0, 23); - return pmul(y, _mm_castsi128_ps(emm0)); + return pmul(y, Packet4f(_mm_castsi128_ps(emm0))); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) @@ -241,7 +241,7 @@ Packet2d pexp(const Packet2d& _x) emm0 = _mm_add_epi32(emm0, p4i_1023_0); emm0 = _mm_slli_epi32(emm0, 20); emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmul(x, _mm_castsi128_pd(emm0)); + return pmul(x, Packet2d(_mm_castsi128_pd(emm0))); } /* evaluation of 4 sines at onces, using SSE2 intrinsics. diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100644 new mode 100755 index f5a3dab52..6912f3bc3 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,9 +22,41 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif +#ifdef EIGEN_VECTORIZE_FMA +#ifndef EIGEN_HAS_FUSED_MADD +#define EIGEN_HAS_FUSED_MADD 1 +#endif +#endif + +#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER) +// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot +// have overloads for both types without linking error. +// One solution is to increase ABI version using -fabi-version=4 (or greater). +// To workaround this inconvenince, we rather wrap 128bit types into the following helper +// structure: +// TODO disable this wrapper if abi-versio>=4, but to detect that without asking the user to define a macro? 
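Before the wrapper itself (defined just below), a minimal standalone illustration of the clash it works around; packet_wrapper, Vec4f and first_lane are made-up names for this sketch, not Eigen identifiers:

    #include <immintrin.h>

    // Build with -mavx so both the 128- and 256-bit types are available.
    // With gcc's default ABI, overloading directly on __m128 and __m256 can make
    // the two declarations mangle to the same symbol and fail to link; wrapping
    // the 128-bit type in a distinct struct gives overload resolution (and the
    // name mangler) two genuinely different types.
    template<typename T>
    struct packet_wrapper
    {
      packet_wrapper() {}
      packet_wrapper(const T& v) : m_val(v) {}
      operator T&() { return m_val; }
      operator const T&() const { return m_val; }
      T m_val;
    };
    typedef packet_wrapper<__m128> Vec4f;

    inline float first_lane(const Vec4f& x)  { return _mm_cvtss_f32(x); }
    inline float first_lane(const __m256& x) { return _mm_cvtss_f32(_mm256_castps256_ps128(x)); }
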
+template +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper<__m128> Packet4f; +typedef eigen_packet_wrapper<__m128i> Packet4i; +typedef eigen_packet_wrapper<__m128d> Packet2d; +#else typedef __m128 Packet4f; typedef __m128i Packet4i; typedef __m128d Packet2d; +#endif template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; @@ -58,13 +90,18 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet4i p4i_##NAME = pset1(X) +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. +#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=4, + HasHalfPacket = 0, HasDiv = 1, HasSin = EIGEN_FAST_MATH, @@ -77,19 +114,23 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet2d type; + typedef Packet2d half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=2, + HasHalfPacket = 0, HasDiv = 1, HasExp = 1, HasSqrt = 1 }; }; +#endif template<> struct packet_traits : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { // FIXME check the Has* Vectorizable = 1, @@ -98,9 +139,9 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits { typedef double type; enum {size=2}; }; -template<> struct unpacket_traits { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4}; typedef Packet4i half; }; #if defined(_MSC_VER) && (_MSC_VER==1500) // Workaround MSVC 9 internal compiler error. @@ -110,13 +151,26 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set_pd(from,from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); } #else -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. +// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) +// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. +// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. 
+// Also note that with AVX, we want it to generate a vbroadcastss. +#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__) +template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { + return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); +} +#endif + +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } +#endif template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } @@ -139,7 +193,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { - return psub(_mm_setr_epi32(0,0,0,0), a); + return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); } template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } @@ -173,6 +227,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } +#ifdef EIGEN_VECTORIZE_FMA +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } +#endif template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } @@ -218,7 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } #if defined(_MSC_VER) template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { @@ -236,7 +294,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { E #endif } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } + template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } #else // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code @@ -245,14 +303,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { E // TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! 
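As a side note on the unaligned-load workaround described in the comment above, a hypothetical standalone helper (loadu_float4 is not an Eigen function) sketches the two-halves trick; when EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS is set, the plain _mm_loadu_* intrinsics are used instead:

    #include <emmintrin.h>

    // Made-up standalone helper showing the "two 64-bit halves" unaligned load:
    // the low and high halves are fetched with load_sd / loadh_pd, so no single
    // unaligned 128-bit access is ever issued.
    static inline __m128 loadu_float4(const float* from)
    {
      __m128d res = _mm_load_sd(reinterpret_cast<const double*>(from));    // lanes 0,1
      res = _mm_loadh_pd(res, reinterpret_cast<const double*>(from + 2));  // lanes 2,3
      return _mm_castpd_ps(res);
    }
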
-#if defined(__GNUC__) && defined(__i386__) +#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8))) // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1 #elif defined(__clang__) // bug 201: Segfaults in __mm_loadh_pd with clang 2.8 #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 #else #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0 + #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 #endif template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) @@ -283,7 +344,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS - return _mm_loadu_si128(reinterpret_cast(from)); + return _mm_loadu_si128(reinterpret_cast(from)); #else __m128d res; res = _mm_load_sd((const double*)(from)) ; @@ -302,38 +363,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { Packet4i tmp; - tmp = _mm_loadl_epi64(reinterpret_cast(from)); + tmp = _mm_loadl_epi64(reinterpret_cast(from)); return vec4i_swizzle1(tmp, 0, 0, 1, 1); } template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast(to), from); } +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE +#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES + _mm_storeu_pd(to, from); +#else _mm_storel_pd((to), from); _mm_storeh_pd((to+1), from); +#endif +} +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), Packet2d(_mm_castps_pd(from))); } +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), Packet2d(_mm_castsi128_pd(from))); } + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, int stride) +{ + return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, int stride) +{ + return _mm_set_pd(from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, int stride) +{ + return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); + } + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, int stride) +{ + to[stride*0] = _mm_cvtss_f32(from); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, int stride) +{ + to[stride*0] = _mm_cvtsd_f64(from); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const 
Packet4i& from, int stride) +{ + to[stride*0] = _mm_cvtsi128_si32(from); + to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castps_pd(from)); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), _mm_castsi128_pd(from)); } // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) { Packet4f pa = _mm_set_ss(a); - pstore(to, vec4f_swizzle1(pa,0,0,0,0)); + pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); } // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) { Packet2d pa = _mm_set_sd(a); - pstore(to, vec2d_swizzle1(pa,0,0)); + pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 @@ -380,6 +480,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +// with AVX, the default implementations based on pload1 are faster +#ifndef __AVX__ +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec4f_swizzle1(a3, 0,0,0,0); + a1 = vec4f_swizzle1(a3, 1,1,1,1); + a2 = vec4f_swizzle1(a3, 2,2,2,2); + a3 = vec4f_swizzle1(a3, 3,3,3,3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + a0 = _mm_loaddup_pd(a+0); + a1 = _mm_loaddup_pd(a+1); + a2 = _mm_loaddup_pd(a+2); + a3 = _mm_loaddup_pd(a+3); +#else + a1 = pload(a); + a0 = vec2d_swizzle1(a1, 0,0); + a1 = vec2d_swizzle1(a1, 1,1); + a3 = pload(a+2); + a2 = vec2d_swizzle1(a3, 0,0); + a3 = vec2d_swizzle1(a3, 1,1); +#endif +} +#endif + EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) { vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); @@ -407,10 +539,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f tmp0 = _mm_hadd_ps(a,a); - return pfirst(_mm_hadd_ps(tmp0, tmp0)); + return pfirst(_mm_hadd_ps(tmp0, tmp0)); } -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } // SSSE3 version: // EIGEN_STRONG_INLINE float predux(const Packet4i& a) @@ -453,7 +585,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, 
_mm_unpackhi_epi64(a,a)); - return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); + return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); } template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) @@ -476,11 +608,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { - return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { @@ -496,17 +628,17 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { - return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSE4_1 Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); + return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); #else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) @@ -522,17 +654,17 @@ template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { - return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { #ifdef EIGEN_VECTORIZE_SSE4_1 Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); - return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); + return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); #else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) 
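The scalar fallback that follows this comment falls outside the hunk, but it amounts to a store plus element-wise compares; a rough standalone equivalent (hmax_epi32 is a made-up name, and alignas assumes C++11) would be:

    #include <emmintrin.h>

    // Rough sketch of the pre-SSE4.1 fallback: store the packet to an aligned
    // buffer and reduce with plain ternaries (the comment above notes that gcc
    // generated poor code when std::min/std::max was used right after the store).
    static inline int hmax_epi32(__m128i a)
    {
      alignas(16) int aux[4];
      _mm_store_si128(reinterpret_cast<__m128i*>(aux), a);
      const int lo = aux[0] > aux[1] ? aux[0] : aux[1];
      const int hi = aux[2] > aux[3] ? aux[2] : aux[3];
      return lo > hi ? lo : hi;
    }
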
@@ -652,6 +784,31 @@ struct palign_impl }; #endif +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = tmp; +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); + __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + + kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); + kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); + kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); + kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index a0fcea3f9..ec42e6850 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -320,6 +320,26 @@ struct functor_traits > }; }; + +/** \internal + * \brief Template functor to compute the atan of a scalar + * \sa class CwiseUnaryOp, ArrayBase::atan() + */ +template struct scalar_atan_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) + inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); } + typedef typename packet_traits::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::patan(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasATan + }; +}; + /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 686ff84f1..7da52c2e8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -10,6 +10,7 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H + namespace Eigen { namespace internal { @@ -91,9 +92,26 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) }; manage_caching_sizes(GetAction, &l1, &l2); - k = std::min(k, l1/kdiv); - SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; - if(_m(k, l1/kdiv); +// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; +// if(_m(k,240); +// n = std::min(n,3840/sizeof(RhsScalar)); +// m = std::min(m,3840/sizeof(RhsScalar)); + + k = std::min(k,sizeof(LhsScalar)<=4 ? 
360 : 240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); +#else + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); +#endif } template @@ -160,16 +178,19 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, + // register block size along the N direction must be 1 or 4 + nr = 4, // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * LhsPacketSize, +#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) + // we assume 16 registers + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif - WorkSpaceFactor = nr * RhsPacketSize, - LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -186,32 +207,65 @@ public: { p = pset1(ResScalar(0)); } - - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } + + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const + { + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = ploadquad(b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const { - dest = pload(b); + dest = pload(a); } - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = pload(a); + dest = ploadu(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const { + // It would be a lot cleaner to call pmadd all the time. Unfortunately if we + // let gcc allocate the register in which to store the result of the pmul + // (in the case where there is no FMA) gcc fails to figure out how to avoid + // spilling register. +#ifdef EIGEN_HAS_FUSED_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c = pmadd(a,b,c); +#else tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); +#endif } EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); } + + template + EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const + { + r = pmadd(c,alpha,r); + } protected: // conj_helper cj; @@ -235,12 +289,16 @@ public: ResPacketSize = Vectorizable ? 
packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/4, - mr = 2 * LhsPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, + nr = 4, +#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) + // we assume 16 registers + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -258,15 +316,14 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = pset1(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -274,6 +331,21 @@ public: dest = pload(a); } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu(a); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); @@ -281,7 +353,12 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_HAS_FUSED_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a.v,b,c.v); +#else tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); +#endif } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -298,6 +375,38 @@ protected: conj_helper cj; }; +template +struct DoublePacket +{ + Packet first; + Packet second; +}; + +template +DoublePacket padd(const DoublePacket &a, const DoublePacket &b) +{ + DoublePacket res; + res.first = padd(a.first, b.first); + res.second = padd(a.second,b.second); + return res; +} + +template +const DoublePacket& predux4(const DoublePacket &a) +{ + return a; +} + +template struct unpacket_traits > { typedef DoublePacket half; }; +// template +// DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) +// { +// DoublePacket res; +// res.first = padd(a.first, b.first); +// res.second = padd(a.second,b.second); +// return res; +// } + template class gebp_traits, std::complex, _ConjLhs, _ConjRhs > { @@ -314,60 +423,80 @@ public: && packet_traits::Vectorizable, RealPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, - - nr = 2, - mr = 2 * ResPacketSize, - WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = ResPacketSize, LhsProgress = ResPacketSize, - RhsProgress = Vectorizable ? 
2*ResPacketSize : 1 + RhsProgress = 1 }; typedef typename packet_traits::type RealPacket; typedef typename packet_traits::type ScalarPacket; - struct DoublePacket - { - RealPacket first; - RealPacket second; - }; + typedef DoublePacket DoublePacketType; typedef typename conditional::type LhsPacket; - typedef typename conditional::type RhsPacket; + typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - typedef typename conditional::type AccPacket; + typedef typename conditional::type AccPacket; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } - EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { p.first = pset1(RealScalar(0)); p.second = pset1(RealScalar(0)); } - /* Unpack the rhs coeff such that each complex coefficient is spread into - * two packects containing respectively the real and imaginary coefficient - * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] - */ - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + // Scalar path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { - for(DenseIndex k=0; k((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); - pstore1((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); - } - else - b[k] = rhs[k]; - } + dest = pset1(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + // Vectorized path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const { - dest.first = pload((const RealScalar*)b); - dest.second = pload((const RealScalar*)(b+ResPacketSize)); + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + { + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + { + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + loadRhs(b+2, b2); + loadRhs(b+3, b3); + } + + // Vectorized path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + + // Scalar path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + { + // FIXME not sure that's the best way to implement it! 
+ loadRhs(b+0, b0); + loadRhs(b+1, b1); } // nothing special here @@ -376,7 +505,12 @@ public: dest = pload((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu((const typename unpacket_traits::type*)(a)); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -389,7 +523,7 @@ public: EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const + EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const { // assemble c ResPacket tmp; @@ -440,12 +574,12 @@ public: ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters nr = 4, - mr = 2*ResPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize, LhsProgress = ResPacketSize, - RhsProgress = ResPacketSize + RhsProgress = 1 }; typedef typename packet_traits::type _LhsPacket; @@ -463,21 +597,38 @@ public: p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) - { - for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); - } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pload(b); + dest = pset1(*b); } + + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// // FIXME not sure that's the best way to implement it! 
+// b0 = pload1(b+0); +// b1 = pload1(b+1); +// } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup(a); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup(a); + } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -486,7 +637,13 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_HAS_FUSED_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a,b.v,c.v); +#else tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); +#endif + } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -519,6 +676,14 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + + typedef gebp_traits SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + enum { Vectorizable = Traits::Vectorizable, @@ -529,570 +694,708 @@ struct gebp_kernel EIGEN_DONT_INLINE void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; template EIGEN_DONT_INLINE void gebp_kernel ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) + Index strideA, Index strideB, Index offsetA, Index offsetB) { Traits traits; + SwappedTraits straits; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; -// conj_helper pcj; - Index packet_cols = (cols/nr) * nr; - const Index peeled_mc = (rows/mr)*mr; - // FIXME: - const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); - const Index peeled_kc = (depth/4)*4; - - if(unpackedB==0) - unpackedB = const_cast(blockB - strideB * nr * RhsProgress); - - // loops on each micro vertical panel of rhs (depth x nr) - for(Index j2=0; j2=4 ? (cols/4) * 4 : 0; + const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; + const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) + const Index peeled_kc = depth & ~(pk-1); + const Index prefetch_res_offset = 32/sizeof(ResScalar); +// const Index depth2 = depth & ~1; + + //---------- Process 3 * LhsProgress rows at once ---------- + // This corresponds to 3*LhsProgress x nr register blocks. 
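// A standalone scalar sketch of the register-blocking idea used by this
// micro-kernel (not the Eigen code itself; MR/NR, the packing layout and the
// function name are illustrative). An MR x NR block of the result is kept in
// MR*NR local accumulators -- the C0..C11 registers below -- while one slice
// of the packed lhs panel and one slice of the packed rhs panel are consumed
// per k step; the real kernel does the same with packets and pmadd.
#include <cstddef>

enum { MR = 3, NR = 4 };  // the real kernel uses MR = 3 * packet size

// packedA holds MR lhs values per k step, packedB holds NR rhs values per k step.
inline void micro_kernel_sketch(std::size_t depth,
                                const float* packedA, const float* packedB,
                                float* C, std::size_t ldc, float alpha)
{
  float acc[MR][NR] = {};                    // accumulators living in registers
  for (std::size_t k = 0; k < depth; ++k)
  {
    const float* a = packedA + k*MR;
    const float* b = packedB + k*NR;
    for (int i = 0; i < MR; ++i)
      for (int j = 0; j < NR; ++j)
        acc[i][j] += a[i] * b[j];            // the traits.madd(A,B,C,...) step
  }
  for (int i = 0; i < MR; ++i)
    for (int j = 0; j < NR; ++j)
      C[i + j*ldc] += alpha * acc[i][j];     // the traits.acc(C, alphav, R) step
}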
+ // Usually, make sense only with FMA + if(mr>=3*Traits::LhsProgress) { - traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); - - // loops on each largest micro horizontal panel of lhs (mr x depth) - // => we select a mr x nr micro block of res which is entirely - // stored into mr/packet_size x nr registers. - for(Index i=0; i(alpha); - - R0 = ploadu(r0); - R1 = ploadu(r1); - R2 = ploadu(r2); - R3 = ploadu(r3); - R4 = ploadu(r0 + ResPacketSize); - R5 = ploadu(r1 + ResPacketSize); - R6 = ploadu(r2 + ResPacketSize); - traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r3 + ResPacketSize); - - traits.acc(C1, alphav, R1); - traits.acc(C2, alphav, R2); - traits.acc(C3, alphav, R3); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R5); - traits.acc(C6, alphav, R6); - traits.acc(C7, alphav, R0); + // We select a 3*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 3 x nr registers. - pstoreu(r1, R1); - pstoreu(r2, R2); - pstoreu(r3, R3); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R5); - pstoreu(r2 + ResPacketSize, R6); - pstoreu(r3 + ResPacketSize, R0); - } - else - { - ResPacket R0, R1, R4; - ResPacket alphav = pset1(alpha); + const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)]; + prefetch(&blA[0]); - R0 = ploadu(r0); - R1 = ploadu(r1); - R4 = ploadu(r0 + ResPacketSize); + // gets res block as register + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7, + C8, C9, C10, C11; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); + traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = &res[(j2+1)*resStride + i]; + ResScalar* r2 = &res[(j2+2)*resStride + i]; + ResScalar* r3 = &res[(j2+3)*resStride + i]; + + internal::prefetch(r0); + internal::prefetch(r1); + internal::prefetch(r2); + internal::prefetch(r3); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; + + for(Index k=0; k(alpha); + + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r0+1*Traits::ResPacketSize); + R2 = ploadu(r0+2*Traits::ResPacketSize); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu(r1 + ResPacketSize); - traits.acc(C1, alphav, R1); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R0); - pstoreu(r1, R1); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r0+2*Traits::ResPacketSize, R2); + + R0 = ploadu(r1+0*Traits::ResPacketSize); + R1 = ploadu(r1+1*Traits::ResPacketSize); + R2 = ploadu(r1+2*Traits::ResPacketSize); + traits.acc(C1, alphav, R0); + traits.acc(C5, alphav, R1); + traits.acc(C9, alphav, R2); + pstoreu(r1+0*Traits::ResPacketSize, R0); + pstoreu(r1+1*Traits::ResPacketSize, R1); + pstoreu(r1+2*Traits::ResPacketSize, R2); + + R0 = ploadu(r2+0*Traits::ResPacketSize); + R1 = ploadu(r2+1*Traits::ResPacketSize); + R2 = ploadu(r2+2*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C10, alphav, R2); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r2+1*Traits::ResPacketSize, R1); + pstoreu(r2+2*Traits::ResPacketSize, R2); + + R0 = ploadu(r3+0*Traits::ResPacketSize); + R1 = 
ploadu(r3+1*Traits::ResPacketSize); + R2 = ploadu(r3+2*Traits::ResPacketSize); + traits.acc(C3, alphav, R0); + traits.acc(C7, alphav, R1); + traits.acc(C11, alphav, R2); + pstoreu(r3+0*Traits::ResPacketSize, R0); + pstoreu(r3+1*Traits::ResPacketSize, R1); + pstoreu(r3+2*Traits::ResPacketSize, R2); } - } - - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - - // performs "inner" product - const RhsScalar* blB = unpackedB; - for(Index k=0; k(alpha); + + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r0+1*Traits::ResPacketSize); + R2 = ploadu(r0+2*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8 , alphav, R2); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r0+2*Traits::ResPacketSize, R2); } - // process remaining peeled loop - for(Index k=peeled_kc; k(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; - - R0 = ploadu(r0); - R1 = ploadu(r1); - if(nr==4) R2 = ploadu(r2); - if(nr==4) R3 = ploadu(r3); - - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - if(nr==4) traits.acc(C2, alphav, R2); - if(nr==4) traits.acc(C3, alphav, R3); - - pstoreu(r0, R0); - pstoreu(r1, R1); - if(nr==4) pstoreu(r2, R2); - if(nr==4) pstoreu(r3, R3); - } - for(Index i=peeled_mc2; i do the same but with nr==1 - for(Index j2=packet_cols; j2=2*Traits::LhsProgress) { - // unpack B - traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB); - - for(Index i=0; i(alpha); + + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r0+1*Traits::ResPacketSize); + R2 = ploadu(r1+0*Traits::ResPacketSize); + R3 = ploadu(r1+1*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C1, alphav, R2); + traits.acc(C5, alphav, R3); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + pstoreu(r1+0*Traits::ResPacketSize, R2); + pstoreu(r1+1*Traits::ResPacketSize, R3); + + R0 = ploadu(r2+0*Traits::ResPacketSize); + R1 = ploadu(r2+1*Traits::ResPacketSize); + R2 = ploadu(r3+0*Traits::ResPacketSize); + R3 = ploadu(r3+1*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C3, alphav, R2); + traits.acc(C7, alphav, R3); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r2+1*Traits::ResPacketSize, R1); + pstoreu(r3+0*Traits::ResPacketSize, R2); + pstoreu(r3+1*Traits::ResPacketSize, R3); } - ResPacket R0, R4; - ResPacket alphav = pset1(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - - R0 = ploadu(r0); - R4 = ploadu(r0+ResPacketSize); - - traits.acc(C0, alphav, R0); - traits.acc(C4, alphav, R4); - - pstoreu(r0, R0); - pstoreu(r0+ResPacketSize, R4); - } - if(rows-peeled_mc>=LhsProgress) - { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - AccPacket C0; - traits.initAcc(C0); - - const RhsScalar* blB = unpackedB; - for(Index k=0; k(alpha); + + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r0+1*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r0+1*Traits::ResPacketSize, R1); + 
} + } + } + //---------- Process 1 * LhsProgress rows at once ---------- + if(mr>=1*Traits::LhsProgress) + { + // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) + for(Index i=peeled_mc2; i(alpha); + + R0 = ploadu(r0+0*Traits::ResPacketSize); + R1 = ploadu(r1+0*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + pstoreu(r0+0*Traits::ResPacketSize, R0); + pstoreu(r1+0*Traits::ResPacketSize, R1); + + R0 = ploadu(r2+0*Traits::ResPacketSize); + R1 = ploadu(r3+0*Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + pstoreu(r2+0*Traits::ResPacketSize, R0); + pstoreu(r3+0*Traits::ResPacketSize, R1); } - - ResPacket alphav = pset1(alpha); - ResPacket R0 = ploadu(&res[(j2+0)*resStride + i]); - traits.acc(C0, alphav, R0); - pstoreu(&res[(j2+0)*resStride + i], R0); - } - for(Index i=peeled_mc2; i(alpha); + R0 = ploadu(r0+0*Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0+0*Traits::ResPacketSize, R0); + } + } + } + //---------- Process remaining rows, 1 at once ---------- + if(peeled_mc1::half,SResPacket>::type SResPacketHalf; + typedef typename conditional::half,SLhsPacket>::type SLhsPacketHalf; + typedef typename conditional::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional::half,SAccPacket>::type SAccPacketHalf; + + SResPacketHalf R = pgather(&res[j2*resStride + i], resStride); + SResPacketHalf alphav = pset1(alpha); + + if(depth-endk>0) + { + // We have to handle the last row of the rhs which corresponds to a half-packet + SLhsPacketHalf a0; + SRhsPacketHalf b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + SAccPacketHalf c0 = predux4(C0); + straits.madd(a0,b0,c0,b0); + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux4(C0), alphav, R); + } + pscatter(&res[j2*resStride + i], R, resStride); + } + else + { + SResPacket R = pgather(&res[j2*resStride + i], resStride); + SResPacket alphav = pset1(alpha); + straits.acc(C0, alphav, R); + pscatter(&res[j2*resStride + i], R, resStride); + } + } + else // scalar path + { + // get a 1 x 4 res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0); + + for(Index k=0; k -struct gemm_pack_lhs +template +struct gemm_pack_lhs { EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; @@ -1131,64 +1434,176 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 + : Pack2>1 ? 
(rows/Pack2)*Pack2 : 0; + + Index i=0; + + // Pack 3 packets + if(Pack1>=3*PacketSize) { - if(PanelMode) count += Pack1 * offset; - - if(StorageOrder==ColMajor) + for(; i=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + Packet A, B, C; + A = ploadu(&lhs(i+0*PacketSize, k)); + B = ploadu(&lhs(i+1*PacketSize, k)); + C = ploadu(&lhs(i+2*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; + pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } + if(PanelMode) count += (3*PacketSize) * (stride-offset-depth); } - else + } + // Pack 2 packets + if(Pack1>=2*PacketSize) + { + for(; i(&lhs(i+0*PacketSize, k)); + B = ploadu(&lhs(i+1*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; + } + if(PanelMode) count += (2*PacketSize) * (stride-offset-depth); + } + } + // Pack 1 packets + if(Pack1>=1*PacketSize) + { + for(; i(&lhs(i+0*PacketSize, k)); + pstore(blockA+count, cj.pconj(A)); + count+=PacketSize; + } + if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); + } + } + // Pack scalars + if(Pack21) + { + for(; i +struct gemm_pack_lhs +{ + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + const_blas_data_mapper lhs(_lhs,lhsStride); + Index count = 0; + +// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; +// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; +// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + + int pack = Pack1; + Index i = 0; + while(pack>0) + { + Index remaining_rows = rows-i; + Index peeled_mc = i+(remaining_rows/pack)*pack; + for(; i=PacketSize) + { + for(; k kernel; + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } + count += PacketSize*pack; + } + } + for(; k=Pack2) - { - if(PanelMode) count += Pack2*offset; - for(Index k=0; k=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2=8) +// { +// for(Index j2=0; j2 kernel; +// for (int p = 0; p < PacketSize; ++p) { +// kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); +// } +// ptranspose(kernel); +// for (int p = 0; p < PacketSize; ++p) { +// pstoreu(blockB+count, cj.pconj(kernel.packet[p])); +// count+=PacketSize; +// } +// } +// } +// for(; k=4) { - // skip what we have before - if(PanelMode) count += nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - for(Index k=0; k kernel; + kernel.packet[0] = ploadu(&b0[k]); + kernel.packet[1] = ploadu(&b1[k]); + kernel.packet[2] = ploadu(&b2[k]); + kernel.packet[3] = ploadu(&b3[k]); + ptranspose(kernel); + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3])); + count+=4*PacketSize; + } + } + for(; k=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2=8) +// { +// for(Index j2=0; j2(&rhs[k*rhsStride + j2]); +// pstoreu(blockB+count, cj.pconj(A)); +// } else if (PacketSize==4) { +// Packet A = ploadu(&rhs[k*rhsStride + j2]); +// Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); +// pstoreu(blockB+count, cj.pconj(A)); +// pstoreu(blockB+count+PacketSize, cj.pconj(B)); +// } else { +// const Scalar* b0 = &rhs[k*rhsStride + j2]; +// blockB[count+0] = cj(b0[0]); +// blockB[count+1] = cj(b0[1]); +// blockB[count+2] = cj(b0[2]); +// blockB[count+3] = cj(b0[3]); +// blockB[count+4] = cj(b0[4]); +// blockB[count+5] = cj(b0[5]); +// blockB[count+6] = cj(b0[6]); +// blockB[count+7] = cj(b0[7]); +// } +// count += 8; +// } +// // skip what we have after +// if(PanelMode) count += 8 * (stride-offset-depth); +// } +// } + if(nr>=4) { - // skip what we have before - if(PanelMode) count += nr * offset; - for(Index k=0; k(&rhs[k*rhsStride + j2]); - pstoreu(blockB+count, cj.pconj(A)); - count += PacketSize; - } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr==4) blockB[count+2] = cj(b0[2]); - if(nr==4) blockB[count+3] = cj(b0[3]); - count += nr; + // skip what we have before + if(PanelMode) count += 4 * offset; + for(Index k=0; k(&rhs[k*rhsStride + j2]); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + blockB[count+2] = cj(b0[2]); + blockB[count+3] = cj(b0[3]); + count += 4; + } } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2 struct general_matrix_matrix_product { + typedef gebp_traits Traits; + typedef typename scalar_product_traits::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, @@ -51,6 +53,8 @@ template< struct general_matrix_matrix_product { +typedef gebp_traits Traits; + typedef typename scalar_product_traits::ReturnType ResScalar; static void 
run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, @@ -63,11 +67,9 @@ static void run(Index rows, Index cols, Index depth, const_blas_data_mapper lhs(_lhs,lhsStride); const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; - Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction - //Index nc = blocking.nc(); // cache block size along the N direction + Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; @@ -80,68 +82,68 @@ static void run(Index rows, Index cols, Index depth, Index tid = omp_get_thread_num(); Index threads = omp_get_num_threads(); - std::size_t sizeA = kc*mc; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0); + LhsScalar* blockA = blocking.blockA(); + eigen_internal_assert(blockA!=0); - RhsScalar* blockB = blocking.blockB(); - eigen_internal_assert(blockB!=0); - + std::size_t sizeB = kc*nc; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0); + // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... for(Index k=0; k rows of B', and cols of the A' // In order to reduce the chance that a thread has to wait for the other, - // let's start by packing A'. - pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); + // let's start by packing B'. + pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc); - // Pack B_k to B' in a parallel fashion: - // each thread packs the sub block B_k,j to B'_j where j is the thread id. + // Pack A_k to A' in a parallel fashion: + // each thread packs the sub block A_k,i to A'_i where i is the thread id. - // However, before copying to B'_j, we have to make sure that no other thread is still using it, + // However, before copying to A'_i, we have to make sure that no other thread is still using it, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length); - pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); - - // Notify the other threads that the part B'_j is ready to go. + // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; - - // Computes C_i += A' * B' per B'_j + + // Computes C_i += A' * B' per A'_i for(Index shift=0; shift0) - while(info[j].sync!=k) {} - - gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); + while(info[i].sync!=k) {} + gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } - // Then keep going as usual with the remaining A' - for(Index i=mc; i Pack rhs's panel into a sequential chunk of memory (L2 caching) - // Note that this panel will be read as many times as the number of blocks in the lhs's - // vertical panel which is, in practice, a very low number. 
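// For reference, a naive standalone sketch of the kc/mc cache-blocking loop
// structure implemented in this file (illustrative only: the real code packs
// the operands into blockA/blockB and calls the gebp micro-kernel instead of
// the scalar inner loops below, and the OpenMP path additionally splits the
// lhs panel between threads).
#include <algorithm>
#include <cstddef>

// C += A*B with A (rows x depth), B (depth x cols), all column-major.
inline void blocked_gemm_sketch(std::size_t rows, std::size_t cols, std::size_t depth,
                                const float* A, const float* B, float* C,
                                std::size_t kc, std::size_t mc)
{
  for (std::size_t k2 = 0; k2 < depth; k2 += kc)      // K blocking (cache-sized panels)
  {
    const std::size_t akc = std::min(kc, depth - k2);
    for (std::size_t i2 = 0; i2 < rows; i2 += mc)     // M blocking
    {
      const std::size_t amc = std::min(mc, rows - i2);
      // Here Eigen would pack A(i2:i2+amc, k2:k2+akc) and reuse the packed
      // B(k2:k2+akc, :) panel, so the micro-kernel streams unit-stride data.
      for (std::size_t j = 0; j < cols; ++j)
        for (std::size_t k = 0; k < akc; ++k)
        {
          const float b = B[(k2 + k) + j*depth];
          for (std::size_t i = 0; i < amc; ++i)
            C[(i2 + i) + j*rows] += A[(i2 + i) + (k2 + k)*rows] * b;
        }
    }
  }
}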
- pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows); - // For each mc x kc block of the lhs's vertical panel... - // (==GEPP_VAR1) - for(Index i2=0; i2 > template struct gemm_functor { - gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, - BlockingType& blocking) + gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking) : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {} void initParallelSession() const { - m_blocking.allocateB(); + m_blocking.allocateA(); } void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo* info=0) const @@ -224,6 +221,8 @@ struct gemm_functor (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), m_actualAlpha, m_blocking, info); } + + typedef typename Gemm::Traits Traits; protected: const Lhs& m_lhs; @@ -245,7 +244,6 @@ class level3_blocking protected: LhsScalar* m_blockA; RhsScalar* m_blockB; - RhsScalar* m_blockW; DenseIndex m_mc; DenseIndex m_nc; @@ -254,7 +252,7 @@ class level3_blocking public: level3_blocking() - : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0) + : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) {} inline DenseIndex mc() const { return m_mc; } @@ -263,7 +261,6 @@ class level3_blocking inline LhsScalar* blockA() { return m_blockA; } inline RhsScalar* blockB() { return m_blockB; } - inline RhsScalar* blockW() { return m_blockW; } }; template @@ -282,29 +279,25 @@ class gemm_blocking_space Traits; enum { SizeA = ActualRows * MaxDepth, - SizeB = ActualCols * MaxDepth, - SizeW = MaxDepth * Traits::WorkSpaceFactor + SizeB = ActualCols * MaxDepth }; - EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; - EIGEN_ALIGN16 RhsScalar m_staticB[SizeB]; - EIGEN_ALIGN16 RhsScalar m_staticW[SizeW]; + EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA]; + EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB]; public: - gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/) + gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false) { this->m_mc = ActualRows; this->m_nc = ActualCols; this->m_kc = MaxDepth; this->m_blockA = m_staticA; this->m_blockB = m_staticB; - this->m_blockW = m_staticW; } inline void allocateA() {} inline void allocateB() {} - inline void allocateW() {} inline void allocateAll() {} }; @@ -323,20 +316,28 @@ class gemm_blocking_spacem_mc = Transpose ? cols : rows; this->m_nc = Transpose ? 
rows : cols; this->m_kc = depth; - computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc); + if(full_rows) + { + DenseIndex m = this->m_mc; + computeProductBlockingSizes(this->m_kc, m, this->m_nc); + } + else // full columns + { + DenseIndex n = this->m_nc; + computeProductBlockingSizes(this->m_kc, this->m_mc, n); + } + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; - m_sizeW = this->m_kc*Traits::WorkSpaceFactor; } void allocateA() @@ -351,24 +352,16 @@ class gemm_blocking_spacem_blockB = aligned_new(m_sizeB); } - void allocateW() - { - if(this->m_blockW==0) - this->m_blockW = aligned_new(m_sizeW); - } - void allocateAll() { allocateA(); allocateB(); - allocateW(); } ~gemm_blocking_space() { aligned_delete(this->m_blockA, m_sizeA); aligned_delete(this->m_blockB, m_sizeB); - aligned_delete(this->m_blockW, m_sizeW); } }; @@ -394,7 +387,37 @@ class GeneralProduct typedef internal::scalar_product_op BinOp; EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); } + + template + inline void evalTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) + dst.noalias() = m_lhs .lazyProduct( m_rhs ); + else + { + dst.setZero(); + scaleAndAddTo(dst,Scalar(1)); + } + } + template + inline void addTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) + dst.noalias() += m_lhs .lazyProduct( m_rhs ); + else + scaleAndAddTo(dst,Scalar(1)); + } + + template + inline void subTo(Dest& dst) const + { + if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0) + dst.noalias() -= m_lhs .lazyProduct( m_rhs ); + else + scaleAndAddTo(dst,Scalar(-1)); + } + template void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); @@ -417,7 +440,7 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 46be46f85..6070664d5 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*size; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); - RhsScalar* blockB = allocatedBlockB + sizeW; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0); gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; @@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product processed with gebp or skipped if (UpLo==Lower) gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0, allocatedBlockB); + -1, -1, 0, 0); } } } @@ -136,7 +133,7 @@ struct tribb_kernel enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace) + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { gebp_kernel gebp_kernel; Matrix buffer; @@ -150,7 +147,7 @@ struct tribb_kernel if(UpLo==Upper) gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // selfadjoint micro block { @@ -158,7 +155,7 @@ struct tribb_kernel buffer.setZero(); // 1 - apply the kernel on the temporary buffer gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1 template TriangularView& TriangularView::assignProduct(const ProductBase& prod, const Scalar& alpha) { + eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols()); + general_product_to_triangular_selector::run(m_matrix.const_cast_derived(), prod.derived(), alpha); return *this; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index a73ce5ff0..340c51394 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -141,6 +141,12 @@ EIGEN_DONT_INLINE void 
general_matrix_vector_product 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. + // Currently, it seems to be better to perform unaligned loads anyway + alignmentPattern = NoneAligned; + } else if (LhsPacketSize>1) { eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. + alignmentPattern = NoneAligned; + } else if (LhsPacketSize>1) { eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth(tmp0); const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 5c3e9b7ac..4079063eb 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -73,13 +73,13 @@ namespace internal { template struct GemmParallelInfo { - GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {} + GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} int volatile sync; int volatile users; - Index rhs_start; - Index rhs_length; + Index lhs_start; + Index lhs_length; }; template @@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos if((!Condition) || (omp_get_num_threads()>1)) return func(0,rows, 0,cols); - Index size = transpose ? cols : rows; + Index size = transpose ? rows : cols; // 2- compute the maximal number of threads from the size of the product: // FIXME this has to be fine tuned @@ -126,26 +126,25 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos std::swap(rows,cols); Index blockCols = (cols / threads) & ~Index(0x3); - Index blockRows = (rows / threads) & ~Index(0x7); + Index blockRows = (rows / threads); + blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; GemmParallelInfo* info = new GemmParallelInfo[threads]; - #pragma omp parallel for schedule(static,1) num_threads(threads) - for(Index i=0; i +template struct symm_pack_lhs { template inline @@ -45,25 +45,32 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { + enum { PacketSize = packet_traits::size }; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i(blockA, lhs, cols, i, count); - } - - if(rows-peeled_mc>=Pack2) - { - pack(blockA, lhs, cols, peeled_mc, count); - peeled_mc += Pack2; - } + //Index peeled_mc3 = (rows/Pack1)*Pack1; + + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + + if(Pack1>=3*PacketSize) + for(Index i=0; i(blockA, lhs, cols, i, count); + + if(Pack1>=2*PacketSize) + for(Index i=peeled_mc3; i(blockA, lhs, cols, i, count); + + if(Pack1>=1*PacketSize) + for(Index i=peeled_mc2; i(blockA, lhs, cols, i, count); // do the same with mr==1 - for(Index i=peeled_mc; i rhs(_rhs,rhsStride); - Index packet_cols = (cols/nr)*nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; // first part: normal case for(Index j2=0; j2=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } // second part: diagonal block - for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr) + Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2; + if(nr>=8) { - // again we can split vertically in three different parts (transpose, symmetric, normal) - // transpose - for(Index k=k2; k=4) + { + for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4) + { + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k=8) { - for(Index k=k2; k=4) + { + for(Index j2=(std::max)(packet_cols8,k2+rows); j2 the same with nr==1) - for(Index j2=packet_cols; j2 gebp_kernel; symm_pack_lhs pack_lhs; @@ -348,11 +427,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; + std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; + Scalar* blockB = allocatedBlockB; gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; @@ -423,11 +501,11 @@ struct SelfadjointProductMatrix NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor> ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha ); } }; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 3f0618410..d93277cb8 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; triangularBuffer.setZero(); @@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -265,12 +263,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; triangularBuffer.setZero(); @@ -304,6 +300,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; + geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar)); pack_rhs(geb, &rhs(actual_k2,IsLower ? 
0 : k2), rhsStride, actual_kc, rs); @@ -357,14 +354,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix MatrixLhs; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size MatrixLhs; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ if (size conj; gebp_kernel gebp_kernel; @@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; @@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), - -1, -1, 0, 0, blockW); + -1, -1, 0, 0); } } } diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index 1e6e355d6..8acca9c8c 100644 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -54,8 +54,25 @@ #endif #if defined EIGEN_USE_MKL +# include +/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ +# ifndef INTEL_MKL_VERSION +# undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */ +# elif INTEL_MKL_VERSION < 100305 /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/ +# undef EIGEN_USE_MKL +# endif +# ifndef EIGEN_USE_MKL + /*If the MKL version is too old, undef everything*/ +# undef EIGEN_USE_MKL_ALL +# undef EIGEN_USE_BLAS +# undef EIGEN_USE_LAPACKE +# undef EIGEN_USE_MKL_VML +# undef EIGEN_USE_LAPACKE_STRICT +# undef EIGEN_USE_LAPACKE +# endif +#endif -#include +#if defined EIGEN_USE_MKL #include #define EIGEN_MKL_VML_THRESHOLD 128 diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 6aa716698..f84d20397 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -66,13 +66,24 @@ #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #endif +// Defined the boundary (in bytes) on which the data needs to be aligned. Note +// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be +// aligned at all regardless of the value of this #define. +#define EIGEN_ALIGN_BYTES 16 + #ifdef EIGEN_DONT_ALIGN #ifndef EIGEN_DONT_ALIGN_STATICALLY #define EIGEN_DONT_ALIGN_STATICALLY #endif #define EIGEN_ALIGN 0 -#else +#elif !defined(EIGEN_DONT_VECTORIZE) + #if defined(__AVX__) + #undef EIGEN_ALIGN_BYTES + #define EIGEN_ALIGN_BYTES 32 + #endif #define EIGEN_ALIGN 1 +#else + #define EIGEN_ALIGN 0 #endif // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. 
It takes into account both the user choice to explicitly disable @@ -286,13 +297,19 @@ namespace Eigen { #endif #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) +#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES) #if EIGEN_ALIGN_STATICALLY #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16 +#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32 +#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT #else #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN16 +#define EIGEN_USER_ALIGN32 +#define EIGEN_USER_ALIGN_DEFAULT #endif #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD @@ -326,13 +343,13 @@ namespace Eigen { #elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; \ - EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ template \ - EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other) { Base::operator=(other.derived()); return *this; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other) { Base::operator=(other.derived()); return *this; } #else #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; \ - EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \ { \ Base::operator=(other); \ return *this; \ diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 286e963d2..390b60c74 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -32,7 +32,7 @@ // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed // quite safe, at least within the context of glibc, to equate 64-bit with LP64. #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ - && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) + && defined(__LP64__) && ! 
defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16) #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 @@ -42,14 +42,14 @@ // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup -#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) +#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN_BYTES == 16) #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 #endif -#if defined(__APPLE__) \ - || defined(_WIN64) \ +#if (defined(__APPLE__) && (EIGEN_ALIGN_BYTES == 16)) \ + || (defined(_WIN64) && (EIGEN_ALIGN_BYTES == 16)) \ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \ || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED #define EIGEN_MALLOC_ALREADY_ALIGNED 1 @@ -73,7 +73,7 @@ #define EIGEN_HAS_POSIX_MEMALIGN 0 #endif -#ifdef EIGEN_VECTORIZE_SSE +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX #define EIGEN_HAS_MM_MALLOC 1 #else #define EIGEN_HAS_MM_MALLOC 0 @@ -89,7 +89,7 @@ inline void throw_std_bad_alloc() #ifdef EIGEN_EXCEPTIONS throw std::bad_alloc(); #else - std::size_t huge = -1; + std::size_t huge = static_cast(-1); new int[huge]; #endif } @@ -105,9 +105,9 @@ inline void throw_std_bad_alloc() */ inline void* handmade_aligned_malloc(std::size_t size) { - void *original = std::malloc(size+16); + void *original = std::malloc(size+EIGEN_ALIGN_BYTES); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(15))) + 16); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES); *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = if (ptr == 0) return handmade_aligned_malloc(size); void *original = *(reinterpret_cast(ptr) - 1); std::ptrdiff_t previous_offset = static_cast(ptr)-static_cast(original); - original = std::realloc(original,size+16); + original = std::realloc(original,size+EIGEN_ALIGN_BYTES); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(15))) + 16); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES); void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); @@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed() {} #endif -/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. +/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. 
*/ inline void* aligned_malloc(size_t size) @@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size) #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); #elif EIGEN_HAS_POSIX_MEMALIGN - if(posix_memalign(&result, 16, size)) result = 0; + if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0; #elif EIGEN_HAS_MM_MALLOC - result = _mm_malloc(size, 16); + result = _mm_malloc(size, EIGEN_ALIGN_BYTES); #elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) - result = _aligned_malloc(size, 16); + result = _aligned_malloc(size, EIGEN_ALIGN_BYTES); #else result = handmade_aligned_malloc(size); #endif @@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) // implements _mm_malloc/_mm_free based on the corresponding _aligned_ // functions. This may not always be the case and we just try to be safe. #if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,16); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES); #else result = generic_aligned_realloc(ptr,new_size,old_size); #endif #elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) - result = _aligned_realloc(ptr,new_size,16); + result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES); #else result = handmade_aligned_realloc(ptr,new_size,old_size); #endif @@ -552,7 +552,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function #ifndef EIGEN_ALLOCA - #if (defined __linux__) + #if (defined __linux__) || (defined __APPLE__) #define EIGEN_ALLOCA alloca #elif defined(_MSC_VER) #define EIGEN_ALLOCA _alloca @@ -607,9 +607,9 @@ template class aligned_stack_memory_handler * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. */ #ifdef EIGEN_ALLOCA - - #if defined(__arm__) || defined(_WIN32) - #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((reinterpret_cast(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16) + // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled. +#if defined(__arm__) || defined(_WIN32) || EIGEN_ALIGN_BYTES > 16 + #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((reinterpret_cast(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES) #else #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA #endif @@ -679,7 +679,7 @@ template class aligned_stack_memory_handler #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0))) + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0))) /****************************************************************************/ @@ -700,98 +700,42 @@ template class aligned_stack_memory_handler * \sa \ref TopicStlContainers. 
*/ template -class aligned_allocator +class aligned_allocator : public std::allocator { public: - typedef size_t size_type; - typedef std::ptrdiff_t difference_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef T value_type; + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; - template - struct rebind - { - typedef aligned_allocator other; - }; + template + struct rebind + { + typedef aligned_allocator other; + }; - pointer address( reference value ) const - { - return &value; - } + aligned_allocator() : std::allocator() {} - const_pointer address( const_reference value ) const - { - return &value; - } + aligned_allocator(const aligned_allocator& other) : std::allocator(other) {} - aligned_allocator() - { - } + template + aligned_allocator(const aligned_allocator& other) : std::allocator(other) {} - aligned_allocator( const aligned_allocator& ) - { - } + ~aligned_allocator() {} - template - aligned_allocator( const aligned_allocator& ) - { - } + pointer allocate(size_type num, const void* /*hint*/ = 0) + { + internal::check_size_for_overflow(num); + return static_cast( internal::aligned_malloc(num * sizeof(T)) ); + } - ~aligned_allocator() - { - } - - size_type max_size() const - { - return (std::numeric_limits::max)(); - } - - pointer allocate( size_type num, const void* hint = 0 ) - { - EIGEN_UNUSED_VARIABLE(hint); - internal::check_size_for_overflow(num); - return static_cast( internal::aligned_malloc( num * sizeof(T) ) ); - } - - void construct( pointer p, const T& value ) - { - ::new( p ) T( value ); - } - -#if (__cplusplus >= 201103L) - template - void construct( U* u, Args&&... args) - { - ::new( static_cast(u) ) U( std::forward( args )... ); - } -#endif - - void destroy( pointer p ) - { - p->~T(); - } - -#if (__cplusplus >= 201103L) - template - void destroy( U* u ) - { - u->~U(); - } -#endif - - void deallocate( pointer p, size_type /*num*/ ) - { - internal::aligned_free( p ); - } - - bool operator!=(const aligned_allocator& ) const - { return false; } - - bool operator==(const aligned_allocator& ) const - { return true; } + void deallocate(pointer p, size_type /*num*/) + { + internal::aligned_free(p); + } }; //---------- Cache sizes ---------- @@ -823,9 +767,9 @@ namespace internal { #ifdef EIGEN_CPUID -inline bool cpuid_is_vendor(int abcd[4], const char* vendor) +inline bool cpuid_is_vendor(int abcd[4], const int vendor[3]) { - return abcd[1]==(reinterpret_cast(vendor))[0] && abcd[3]==(reinterpret_cast(vendor))[1] && abcd[2]==(reinterpret_cast(vendor))[2]; + return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2]; } inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) @@ -967,13 +911,16 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) { #ifdef EIGEN_CPUID int abcd[4]; + const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e}; + const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163}; + const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!" 
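// Side note on the vendor constants above (a sketch, not part of the patch):
// CPUID leaf 0 returns the vendor string as three 32-bit words in EBX, EDX,
// ECX, with the characters packed little-endian. The helper below rebuilds
// the "GenuineIntel" constants from character literals so the magic numbers
// can be verified by eye; the function name is hypothetical.
static inline bool is_genuine_intel_sketch(const int abcd[4])
{
  const int ebx = 'G' | ('e'<<8) | ('n'<<16) | ('u'<<24);  // 0x756e6547
  const int edx = 'i' | ('n'<<8) | ('e'<<16) | ('I'<<24);  // 0x49656e69
  const int ecx = 'n' | ('t'<<8) | ('e'<<16) | ('l'<<24);  // 0x6c65746e
  return abcd[1]==ebx && abcd[3]==edx && abcd[2]==ecx;
}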
// identify the CPU vendor EIGEN_CPUID(abcd,0x0,0); int max_std_funcs = abcd[1]; - if(cpuid_is_vendor(abcd,"GenuineIntel")) + if(cpuid_is_vendor(abcd,GenuineIntel)) queryCacheSizes_intel(l1,l2,l3,max_std_funcs); - else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!")) + else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_)) queryCacheSizes_amd(l1,l2,l3); else // by default let's use Intel's API diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 5407645c9..14adb6795 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -101,6 +101,7 @@ template struct packet_traits; template struct unpacket_traits { typedef T type; + typedef T half; enum {size=1}; }; @@ -137,7 +138,7 @@ class compute_matrix_flags ((Options&DontAlign)==0) && ( #if EIGEN_ALIGN_STATICALLY - ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0)) + ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) #else 0 #endif diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index bf20e03ef..d2563d470 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -275,10 +275,11 @@ template class EigenSolver */ EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true); + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); - return m_realSchur.info(); + return m_info; } /** \brief Sets the maximum number of iterations allowed. */ @@ -302,6 +303,7 @@ template class EigenSolver EigenvalueType m_eivalues; bool m_isInitialized; bool m_eigenvectorsOk; + ComputationInfo m_info; RealSchur m_realSchur; MatrixType m_matT; @@ -366,12 +368,16 @@ EigenSolver::compute(const MatrixType& matrix, bool computeEigenvect { using std::sqrt; using std::abs; + using std::max; + using numext::isfinite; eigen_assert(matrix.cols() == matrix.rows()); // Reduce to real Schur form. 
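The reworked info() lets callers detect non-finite input instead of tripping an internal assertion. A minimal usage sketch of what the surrounding changes provide (the NumericalIssue report for NaN/Inf input is exactly what this patch introduces):

#include <Eigen/Eigenvalues>
#include <iostream>
#include <limits>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4,4);
  A(1,2) = std::numeric_limits<double>::quiet_NaN();   // poison the input
  Eigen::EigenSolver<Eigen::MatrixXd> es(A);
  if (es.info() != Eigen::Success)                     // NumericalIssue once this change is in
    std::cout << "eigen decomposition failed, eigenvalues/vectors are not usable\n";
  return 0;
}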
m_realSchur.compute(matrix, computeEigenvectors); + + m_info = m_realSchur.info(); - if (m_realSchur.info() == Success) + if (m_info == Success) { m_matT = m_realSchur.matrixT(); if (computeEigenvectors) @@ -385,14 +391,40 @@ EigenSolver::compute(const MatrixType& matrix, bool computeEigenvect if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) { m_eivalues.coeffRef(i) = m_matT.coeff(i, i); + if(!isfinite(m_eivalues.coeffRef(i))) + { + m_isInitialized = true; + m_eigenvectorsOk = false; + m_info = NumericalIssue; + return *this; + } ++i; } else { Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1)); - Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1))); + Scalar z; + // Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1))); + // without overflow + { + Scalar t0 = m_matT.coeff(i+1, i); + Scalar t1 = m_matT.coeff(i, i+1); + Scalar maxval = (max)(abs(p),(max)(abs(t0),abs(t1))); + t0 /= maxval; + t1 /= maxval; + Scalar p0 = p/maxval; + z = maxval * sqrt(abs(p0 * p0 + t0 * t1)); + } + m_eivalues.coeffRef(i) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z); m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z); + if(!(isfinite(m_eivalues.coeffRef(i)) && isfinite(m_eivalues.coeffRef(i+1)))) + { + m_isInitialized = true; + m_eigenvectorsOk = false; + m_info = NumericalIssue; + return *this; + } i += 2; } } @@ -581,7 +613,7 @@ void EigenSolver::doComputeEigenvectors() } else { - eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen + eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)"); // this should not happen } } diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h index f424e6d7d..636712c2b 100644 --- a/Eigen/src/Geometry/AngleAxis.h +++ b/Eigen/src/Geometry/AngleAxis.h @@ -77,7 +77,9 @@ public: * represents an invalid rotation. */ template inline AngleAxis(const Scalar& angle, const MatrixBase& axis) : m_axis(axis), m_angle(angle) {} - /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */ + /** Constructs and initialize the angle-axis rotation from a quaternion \a q. + * This function implicitly normalizes the quaternion \a q. + */ template inline explicit AngleAxis(const QuaternionBase& q) { *this = q; } /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */ template @@ -149,29 +151,27 @@ typedef AngleAxis AngleAxisf; typedef AngleAxis AngleAxisd; /** Set \c *this from a \b unit quaternion. - * The axis is normalized. + * The resulting axis is normalized. * - * \warning As any other method dealing with quaternion, if the input quaternion - * is not normalized then the result is undefined. + * This function implicitly normalizes the quaternion \a q. 
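The rescaling by maxval above is the standard trick to keep p*p and t0*t1 in range: sqrt(|p^2 + t0*t1|) = m * sqrt(|(p/m)^2 + (t0/m)(t1/m)|) for any m > 0. A standalone illustration with values whose squares overflow double:

#include <algorithm>
#include <cmath>
#include <iostream>

int main()
{
  double p = 1e200, t0 = 2e200, t1 = 0.5e200;          // exact result is sqrt(2)*1e200
  double naive = std::sqrt(std::abs(p*p + t0*t1));     // inf: both products overflow
  double m  = std::max(std::abs(p), std::max(std::abs(t0), std::abs(t1)));
  double p0 = p/m, s0 = t0/m, s1 = t1/m;
  double scaled = m * std::sqrt(std::abs(p0*p0 + s0*s1));
  std::cout << naive << " vs " << scaled << "\n";      // inf vs ~1.4142e+200
  return 0;
}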
*/ template template AngleAxis& AngleAxis::operator=(const QuaternionBase& q) { - using std::acos; - EIGEN_USING_STD_MATH(min); - EIGEN_USING_STD_MATH(max); - using std::sqrt; - Scalar n2 = q.vec().squaredNorm(); - if (n2 < NumTraits::dummy_precision()*NumTraits::dummy_precision()) + using std::atan2; + Scalar n = q.vec().norm(); + if(n::epsilon()) + n = q.vec().stableNorm(); + if (n > Scalar(0)) { - m_angle = 0; - m_axis << 1, 0, 0; + m_angle = Scalar(2)*atan2(n, q.w()); + m_axis = q.vec() / n; } else { - m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1))); - m_axis = q.vec() / sqrt(n2); + m_angle = 0; + m_axis << 1, 0, 0; } return *this; } diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index a712692e5..11e5398d4 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -587,7 +587,7 @@ inline Derived& QuaternionBase::setFromTwoVectors(const MatrixBase::dummy_precision()) { - c = max(c,-1); + c = (max)(c,Scalar(-1)); Matrix m; m << v0.transpose(), v1.transpose(); JacobiSVD > svd(m, ComputeFullV); Vector3 axis = svd.matrixV().col(2); diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index f40644011..cb93acf6b 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -194,9 +194,9 @@ public: /** type of the matrix used to represent the linear part of the transformation */ typedef Matrix LinearMatrixType; /** type of read/write reference to the linear part of the transformation */ - typedef Block LinearPart; + typedef Block LinearPart; /** type of read reference to the linear part of the transformation */ - typedef const Block ConstLinearPart; + typedef const Block ConstLinearPart; /** type of read/write reference to the affine part of the transformation */ typedef typename internal::conditional& src, const MatrixBase& dst, boo const Index n = src.cols(); // number of measurements // required for demeaning ... - const RealScalar one_over_n = 1 / static_cast(n); + const RealScalar one_over_n = RealScalar(1) / static_cast(n); // computation of mean const VectorType src_mean = src.rowwise().sum() * one_over_n; @@ -136,16 +136,16 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo // Eq. (39) VectorType S = VectorType::Ones(m); - if (sigma.determinant()<0) S(m-1) = -1; + if (sigma.determinant() 0 ) { + if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) { Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose(); } else { - const Scalar s = S(m-1); S(m-1) = -1; + const Scalar s = S(m-1); S(m-1) = Scalar(-1); Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); S(m-1) = s; } @@ -156,7 +156,7 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo if (with_scaling) { // Eq. (42) - const Scalar c = 1/src_var * svd.singularValues().dot(S); + const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S); // Eq. 
(41) Rt.col(m).head(m) = dst_mean; diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h index 1991c6527..60dbea5f5 100644 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec typedef typename MatrixType::Index Index; enum { TFactorSize = MatrixType::ColsAtCompileTime }; Index nbVecs = vectors.cols(); - Matrix T(nbVecs,nbVecs); + Matrix T(nbVecs,nbVecs); make_block_householder_triangular_factor(T, vectors, hCoeffs); const TriangularView& V(vectors); diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 73ca9bfde..1f3c060d0 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -65,10 +65,10 @@ class DiagonalPreconditioner { typename MatType::InnerIterator it(mat,j); while(it && it.index()!=j) ++it; - if(it && it.index()==j) + if(it && it.index()==j && it.value()!=Scalar(0)) m_invdiag(j) = Scalar(1)/it.value(); else - m_invdiag(j) = 0; + m_invdiag(j) = Scalar(1); } m_isInitialized = true; return *this; diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 78cd9b90f..daf99e305 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -29,10 +29,11 @@ template struct traits > * * \param MatrixType the type of the matrix of which we are computing the LU decomposition * - * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A - * is decomposed as A = PLUQ where L is unit-lower-triangular, U is upper-triangular, and P and Q - * are permutation matrices. This is a rank-revealing LU decomposition. The eigenvalues (diagonal - * coefficients) of U are sorted in such a way that any zeros are at the end. + * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is + * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is + * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU + * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any + * zeros are at the end. * * This decomposition provides the generic approach to solving systems of linear equations, computing * the rank, invertibility, inverse, kernel, and determinant. @@ -546,8 +547,8 @@ typename internal::traits::Scalar FullPivLU::determinant } /** \returns the matrix represented by the decomposition, - * i.e., it returns the product: P^{-1} L U Q^{-1}. - * This function is provided for debug purpose. */ + * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$. + * This function is provided for debug purposes. */ template MatrixType FullPivLU::reconstructedMatrix() const { diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index b4da6531a..4e0609784 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -109,7 +109,7 @@ class NaturalOrdering * \class COLAMDOrdering * * Functor computing the \em column \em approximate \em minimum \em degree ordering - * The matrix should be in column-major format + * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()). 
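The DiagonalPreconditioner change in this patch makes a missing or exactly-zero diagonal entry fall back to a factor of 1 (leave that row unscaled) instead of storing 0 or an infinity. A small sketch of the situation it fixes, using BiCGSTAB, which takes this preconditioner by default:

#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <iostream>

int main()
{
  typedef Eigen::SparseMatrix<double> SpMat;
  SpMat A(2,2);                 // [[0 1],[1 0]]: every diagonal entry is absent/zero
  A.insert(0,1) = 1.0;
  A.insert(1,0) = 1.0;
  A.makeCompressed();
  Eigen::BiCGSTAB<SpMat> solver(A);        // default preconditioner: DiagonalPreconditioner
  Eigen::Vector2d b(1.0, 2.0);
  Eigen::Vector2d x = solver.solve(b);
  // with the fallback to 1 the solve is expected to succeed and return (2, 1)
  std::cout << x.transpose() << "  ok=" << (solver.info() == Eigen::Success) << "\n";
  return 0;
}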
*/ template class COLAMDOrdering @@ -118,10 +118,14 @@ class COLAMDOrdering typedef PermutationMatrix PermutationType; typedef Matrix IndexVector; - /** Compute the permutation vector form a sparse matrix */ + /** Compute the permutation vector \a perm form the sparse matrix \a mat + * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + */ template void operator() (const MatrixType& mat, PermutationType& perm) { + eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering"); + Index m = mat.rows(); Index n = mat.cols(); Index nnz = mat.nonZeros(); diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index d78583ecc..a0c0c9172 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -415,6 +415,7 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, JacobiRotation *j_right) { using std::sqrt; + using std::abs; Matrix m; m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)), numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q)); @@ -428,9 +429,11 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q, } else { - RealScalar u = d / t; - rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u)); - rot1.s() = rot1.c() * u; + RealScalar t2d2 = numext::hypot(t,d); + rot1.c() = abs(t)/t2d2; + rot1.s() = d/t2d2; + if(tmakeJacobi(m,0,1); diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 6f615e250..5b95cc33f 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -338,7 +338,10 @@ const Block SparseMatrixBase::inner namespace internal { template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel, - bool OuterVector = (BlockCols==1 && XprType::IsRowMajor) || (BlockRows==1 && !XprType::IsRowMajor)> + bool OuterVector = (BlockCols==1 && XprType::IsRowMajor) + | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&". + // revert to || as soon as not needed anymore. + (BlockRows==1 && !XprType::IsRowMajor)> class GenericSparseBlockInnerIteratorImpl; } diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index a25bd83eb..aa8bfc0f5 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -1205,7 +1205,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse m_data.value(p) = m_data.value(p-1); --p; } - eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end"); + eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end"); m_innerNonZeros[outer]++; diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 267c48bc3..5fb5bc203 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -58,6 +58,7 @@ namespace internal { * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module * OrderingMethods \endlink module for the list of built-in and external ordering methods. * + * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). 
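Both COLAMDOrdering and SparseQR now assert that the input is compressed, so the typical call sequence is: assemble, makeCompressed(), then factorize. A minimal usage sketch:

#include <Eigen/SparseCore>
#include <Eigen/SparseQR>
#include <Eigen/OrderingMethods>

int main()
{
  typedef Eigen::SparseMatrix<double> SpMat;
  SpMat A(4,3);
  A.insert(0,0) = 1.0; A.insert(1,1) = 2.0; A.insert(2,0) = 1.0; A.insert(3,2) = 3.0;
  A.makeCompressed();   // required: skipping this now triggers the eigen_assert added above
  Eigen::SparseQR<SpMat, Eigen::COLAMDOrdering<int> > qr(A);
  Eigen::VectorXd b = Eigen::VectorXd::Ones(4);
  Eigen::VectorXd x = qr.solve(b);   // least-squares solution of the 4x3 system
  return 0;
}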
* */ template @@ -77,10 +78,23 @@ class SparseQR SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false) { } + /** Construct a QR factorization of the matrix \a mat. + * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + * + * \sa compute() + */ SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false) { compute(mat); } + + /** Computes the QR factorization of the sparse matrix \a mat. + * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). + * + * \sa analyzePattern(), factorize() + */ void compute(const MatrixType& mat) { analyzePattern(mat); @@ -255,6 +269,8 @@ class SparseQR }; /** \brief Preprocessing step of a QR factorization + * + * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()). * * In this step, the fill-reducing permutation is computed and applied to the columns of A * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited. @@ -264,6 +280,7 @@ class SparseQR template void SparseQR::analyzePattern(const MatrixType& mat) { + eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR"); // Compute the column fill reducing ordering OrderingType ord; ord(mat, m_perm_c); diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h index 4ee8e5c10..aaf66330b 100644 --- a/Eigen/src/StlSupport/StdDeque.h +++ b/Eigen/src/StlSupport/StdDeque.h @@ -11,7 +11,7 @@ #ifndef EIGEN_STDDEQUE_H #define EIGEN_STDDEQUE_H -#include "Eigen/src/StlSupport/details.h" +#include "details.h" // Define the explicit instantiation (e.g. necessary for the Intel compiler) #if defined(__INTEL_COMPILER) || defined(__GNUC__) diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h index 627381ece..3c742430c 100644 --- a/Eigen/src/StlSupport/StdList.h +++ b/Eigen/src/StlSupport/StdList.h @@ -10,7 +10,7 @@ #ifndef EIGEN_STDLIST_H #define EIGEN_STDLIST_H -#include "Eigen/src/StlSupport/details.h" +#include "details.h" // Define the explicit instantiation (e.g. necessary for the Intel compiler) #if defined(__INTEL_COMPILER) || defined(__GNUC__) diff --git a/Eigen/src/StlSupport/StdVector.h b/Eigen/src/StlSupport/StdVector.h index 40a9abefa..611664a2e 100644 --- a/Eigen/src/StlSupport/StdVector.h +++ b/Eigen/src/StlSupport/StdVector.h @@ -11,7 +11,7 @@ #ifndef EIGEN_STDVECTOR_H #define EIGEN_STDVECTOR_H -#include "Eigen/src/StlSupport/details.h" +#include "details.h" /** * This section contains a convenience MACRO which allows an easy specialization of diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index aea3375ed..ce462e951 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -141,6 +141,18 @@ tan() const return derived(); } +/** \returns an expression of the coefficient-wise arc tan of *this. + * + * Example: \include Cwise_atan.cpp + * Output: \verbinclude Cwise_atan.out + * + * \sa cos(), sin(), tan() + */ +inline const CwiseUnaryOp, Derived> +atan() const +{ + return derived(); +} /** \returns an expression of the coefficient-wise power of *this to the given exponent. 
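The new atan() mirrors the existing coefficient-wise sin()/cos()/tan(). A one-line usage sketch (requires a build that already contains this addition):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::ArrayXd x = Eigen::ArrayXd::LinSpaced(5, -2.0, 2.0);
  Eigen::ArrayXd y = x.atan();                 // coefficient-wise arc tangent, in radians
  std::cout << y.transpose() << "\n";
  return 0;
}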
* diff --git a/README.md b/README.md new file mode 100644 index 000000000..04fa63fce --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.** + +For more information go to http://eigen.tuxfamily.org/. diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 41ca8b3b6..8222271fb 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -2,6 +2,14 @@ // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out +// Compilation options: +// +// -DSCALAR=std::complex +// -DSCALARA=double or -DSCALARB=double +// -DHAVE_BLAS +// -DDECOUPLED +// + #include #include #include @@ -14,10 +22,18 @@ using namespace Eigen; #define SCALAR float #endif +#ifndef SCALARA +#define SCALARA SCALAR +#endif + +#ifndef SCALARB +#define SCALARB SCALAR +#endif + typedef SCALAR Scalar; typedef NumTraits::Real RealScalar; -typedef Matrix A; -typedef Matrix B; +typedef Matrix A; +typedef Matrix B; typedef Matrix C; typedef Matrix M; @@ -129,35 +145,62 @@ int main(int argc, char ** argv) int tries = 2; // number of tries, we keep the best int s = 2048; + int m = s; + int n = s; + int p = s; int cache_size = -1; bool need_help = false; - for (int i=1; i c t p\n"; + std::cout << argv[0] << " -s -c -t -p \n"; + std::cout << " : size\n"; + std::cout << " : rows columns depth\n"; return 1; } if(cache_size>0) setCpuCacheSizes(cache_size,96*cache_size); - int m = s; - int n = s; - int p = s; + A a(m,p); a.setRandom(); B b(p,n); b.setRandom(); C c(m,n); c.setOnes(); @@ -172,6 +215,7 @@ int main(int argc, char ** argv) // check the parallel product is correct #if defined EIGEN_HAS_OPENMP + Eigen::initParallel(); int procs = omp_get_max_threads(); if(procs>1) { @@ -188,11 +232,20 @@ int main(int argc, char ** argv) #elif defined HAVE_BLAS blas_gemm(a,b,r); c.noalias() += a * b; - if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; + if(!r.isApprox(c)) { + std::cout << r - c << "\n"; + std::cerr << "Warning, your product is crap!\n\n"; + } #else - gemm(a,b,c); - r.noalias() += a.cast() * b.cast(); - if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n"; + if(1.*m*n*p<2000.*2000*2000) + { + gemm(a,b,c); + r.noalias() += a.cast() .lazyProduct( b.cast() ); + if(!r.isApprox(c)) { + std::cout << r - c << "\n"; + std::cerr << "Warning, your product is crap!\n\n"; + } + } #endif #ifdef HAVE_BLAS @@ -214,7 +267,7 @@ int main(int argc, char ** argv) { BenchTimer tmono; omp_set_num_threads(1); - Eigen::internal::setNbThreads(1); + Eigen::setNbThreads(1); c = rc; BENCH(tmono, tries, rep, gemm(a,b,c)); std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n"; @@ -223,6 +276,15 @@ int main(int argc, char ** argv) } #endif + if(1.*m*n*p<30*30*30) + { + BenchTimer tmt; + c = rc; + BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b)); + std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; + std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; + } + #ifdef DECOUPLED if((NumTraits::IsComplex) && (NumTraits::IsComplex)) { diff --git 
a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index 0ac5c15fc..b299d9899 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -104,6 +104,7 @@ add_subdirectory(libs/mtl4) add_subdirectory(libs/blitz) add_subdirectory(libs/tvmet) add_subdirectory(libs/STL) +add_subdirectory(libs/blaze) add_subdirectory(data) diff --git a/bench/btl/actions/action_axpby.hh b/bench/btl/actions/action_axpby.hh index 98511ab6a..dadd0ccf3 100644 --- a/bench/btl/actions/action_axpby.hh +++ b/bench/btl/actions/action_axpby.hh @@ -33,7 +33,7 @@ class Action_axpby { public : // Ctor - Action_axpby( int size ):_size(size),_alpha(0.5),_beta(0.95) + Action_axpby( int size ):_alpha(0.5),_beta(0.95),_size(size) { MESSAGE("Action_axpby Ctor"); diff --git a/bench/btl/actions/action_axpy.hh b/bench/btl/actions/action_axpy.hh index e4cb3a5bd..261be4cb8 100644 --- a/bench/btl/actions/action_axpy.hh +++ b/bench/btl/actions/action_axpy.hh @@ -35,7 +35,7 @@ public : // Ctor - Action_axpy( int size ):_size(size),_coef(1.0) + Action_axpy( int size ):_coef(1.0),_size(size) { MESSAGE("Action_axpy Ctor"); diff --git a/bench/btl/cmake/FindATLAS.cmake b/bench/btl/cmake/FindATLAS.cmake index 6b9065206..14b1dee09 100644 --- a/bench/btl/cmake/FindATLAS.cmake +++ b/bench/btl/cmake/FindATLAS.cmake @@ -4,10 +4,7 @@ if (ATLAS_LIBRARIES) endif (ATLAS_LIBRARIES) find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) - -find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) @@ -22,14 +19,14 @@ find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) - set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS} ${ATLAS_F77BLAS} ${ATLAS_LIB}) + set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_LIB}) # search the default lapack lib link to it find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64) find_library(ATLAS_REFERENCE_LAPACK NAMES lapack) - if(ATLAS_REFERENCE_LAPACK) - set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK}) - endif() +# if(ATLAS_REFERENCE_LAPACK) +# set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK}) +# endif() endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) diff --git a/bench/btl/cmake/FindBLAZE.cmake b/bench/btl/cmake/FindBLAZE.cmake new file mode 100644 index 000000000..dba4c89f2 --- /dev/null +++ b/bench/btl/cmake/FindBLAZE.cmake @@ -0,0 +1,31 @@ +# - Try to find eigen2 headers +# Once done this will define +# +# BLAZE_FOUND - system has blaze lib +# BLAZE_INCLUDE_DIR - the blaze include directory +# +# Copyright (C) 2008 Gael Guennebaud +# Adapted from FindEigen.cmake: +# Copyright (c) 2006, 2007 Montel Laurent, +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
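The Action_axpby/Action_axpy constructor edits only reorder the member initializer lists to match the declaration order of the members; C++ always initializes members in declaration order regardless of how the list is written, and a mismatched list is what GCC's -Wreorder flags. A tiny illustration (names are made up):

struct Example {
  double alpha_;   // declared first, therefore initialized first
  int    size_;    // declared second, therefore initialized second
  // Listing size_ before alpha_ here would not change the actual order,
  // it would only earn a -Wreorder warning; matching declaration order keeps it silent.
  explicit Example(int size) : alpha_(0.5), size_(size) {}
};

int main()
{
  Example e(8);
  return e.size_ == 8 ? 0 : 1;
}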
+ +if (BLAZE_INCLUDE_DIR) + + # in cache already + set(BLAZE_FOUND TRUE) + +else (BLAZE_INCLUDE_DIR) + +find_path(BLAZE_INCLUDE_DIR NAMES blaze/Blaze.h + PATHS + ${INCLUDE_INSTALL_DIR} + ) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(BLAZE DEFAULT_MSG BLAZE_INCLUDE_DIR) + +mark_as_advanced(BLAZE_INCLUDE_DIR) + +endif(BLAZE_INCLUDE_DIR) + diff --git a/bench/btl/cmake/FindGOTO.cmake b/bench/btl/cmake/FindGOTO.cmake deleted file mode 100644 index 67ea0934a..000000000 --- a/bench/btl/cmake/FindGOTO.cmake +++ /dev/null @@ -1,15 +0,0 @@ - -if (GOTO_LIBRARIES) - set(GOTO_FIND_QUIETLY TRUE) -endif (GOTO_LIBRARIES) - -find_library(GOTO_LIBRARIES goto PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR}) - -if(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) - set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread -lgfortran") -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(GOTO DEFAULT_MSG GOTO_LIBRARIES) - -mark_as_advanced(GOTO_LIBRARIES) diff --git a/bench/btl/cmake/FindGOTO2.cmake b/bench/btl/cmake/FindGOTO2.cmake deleted file mode 100644 index baa68d213..000000000 --- a/bench/btl/cmake/FindGOTO2.cmake +++ /dev/null @@ -1,25 +0,0 @@ - -if (GOTO2_LIBRARIES) - set(GOTO2_FIND_QUIETLY TRUE) -endif (GOTO2_LIBRARIES) -# -# find_path(GOTO_INCLUDES -# NAMES -# cblas.h -# PATHS -# $ENV{GOTODIR}/include -# ${INCLUDE_INSTALL_DIR} -# ) - -find_file(GOTO2_LIBRARIES libgoto2.so PATHS /usr/lib $ENV{GOTO2DIR} ${LIB_INSTALL_DIR}) -find_library(GOTO2_LIBRARIES goto2 PATHS $ENV{GOTO2DIR} ${LIB_INSTALL_DIR}) - -if(GOTO2_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) - set(GOTO2_LIBRARIES ${GOTO2_LIBRARIES} "-lpthread -lgfortran") -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(GOTO2 DEFAULT_MSG - GOTO2_LIBRARIES) - -mark_as_advanced(GOTO2_LIBRARIES) diff --git a/bench/btl/cmake/FindOPENBLAS.cmake b/bench/btl/cmake/FindOPENBLAS.cmake new file mode 100644 index 000000000..c76fc251c --- /dev/null +++ b/bench/btl/cmake/FindOPENBLAS.cmake @@ -0,0 +1,17 @@ + +if (OPENBLAS_LIBRARIES) + set(OPENBLAS_FIND_QUIETLY TRUE) +endif (OPENBLAS_LIBRARIES) + +find_file(OPENBLAS_LIBRARIES libopenblas.so PATHS /usr/lib $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) +find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR}) + +if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) + set(OPENBLAS_LIBRARIES ${OPENBLAS_LIBRARIES} "-lpthread -lgfortran") +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OPENBLAS DEFAULT_MSG + OPENBLAS_LIBRARIES) + +mark_as_advanced(OPENBLAS_LIBRARIES) diff --git a/bench/btl/data/action_settings.txt b/bench/btl/data/action_settings.txt index e32213e22..39d2b5dc4 100644 --- a/bench/btl/data/action_settings.txt +++ b/bench/btl/data/action_settings.txt @@ -1,19 +1,19 @@ -aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:3000 -ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:3000 -atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:3000 +aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:5000 +ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:5000 +atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:5000 axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000 axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000 -matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:3000 -matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000 -trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:3000 -trisolve_vector ; "{/*1.5 triangular solver - vector (X = 
inv(L) X)}" ; "size" ; 4:3000 -trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:3000 -cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000 -complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000 -partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000 -tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000 -hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000 -symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:3000 -syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000 -ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:3000 -rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000 \ No newline at end of file +matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:5000 +matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:5000 +trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:5000 +trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:5000 +trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:5000 +cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:5000 +complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:5000 +partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:5000 +tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:5000 +hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:5000 +symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:5000 +syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:5000 +ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:5000 +rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000 diff --git a/bench/btl/data/perlib_plot_settings.txt b/bench/btl/data/perlib_plot_settings.txt index 6844bab28..f023cfe02 100644 --- a/bench/btl/data/perlib_plot_settings.txt +++ b/bench/btl/data/perlib_plot_settings.txt @@ -10,7 +10,7 @@ ublas ; with lines lw 3 lt 1 lc rgbcolor "#00b7ff" mtl4 ; with lines lw 3 lt 1 lc rgbcolor "#d18847" blitz ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff" F77 ; with lines lw 3 lt 3 lc rgbcolor "#e6e64c" -GOTO ; with lines lw 3 lt 3 lc rgbcolor "#C05600" -GOTO2 ; with lines lw 3 lt 1 lc rgbcolor "#C05600" +OPENBLAS ; with lines lw 3 lt 1 lc rgbcolor "#C05600" C ; with lines lw 3 lt 3 lc rgbcolor "#e6bd96" ACML ; with lines lw 2 lt 3 lc rgbcolor "#e6e64c" +blaze ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff" diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index 4c355cd6e..0f62bd421 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -33,7 +33,7 @@ // min matrix size for matrix vector product bench #define MIN_MV 5 // max matrix size for matrix vector product bench -#define MAX_MV 3000 +#define MAX_MV 5000 // min matrix size for matrix matrix product bench #define MIN_MM 5 // max matrix size for matrix matrix product bench diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh index 4670f5113..92af1306a 100644 --- a/bench/btl/generic_bench/btl.hh +++ b/bench/btl/generic_bench/btl.hh @@ -176,7 +176,7 @@ public: if (_config!=NULL) { std::vector config = BtlString(_config).split(" \t\n"); - for (int i = 0; i -void 
size_lin_log(const int nb_point, const int size_min, const int size_max, Vector & X) +void size_lin_log(const int nb_point, const int /*size_min*/, const int size_max, Vector & X) { int ten=10; int nine=9; diff --git a/bench/btl/libs/BLAS/CMakeLists.txt b/bench/btl/libs/BLAS/CMakeLists.txt index de42fe047..0272ccad0 100644 --- a/bench/btl/libs/BLAS/CMakeLists.txt +++ b/bench/btl/libs/BLAS/CMakeLists.txt @@ -18,27 +18,14 @@ if (MKL_FOUND) endif (MKL_FOUND) -find_package(GOTO2) -if (GOTO2_FOUND) - btl_add_bench(btl_goto2 main.cpp) - if(BUILD_btl_goto2) - target_link_libraries(btl_goto2 ${GOTO_LIBRARIES} ) - set_target_properties(btl_goto2 PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO2") - endif(BUILD_btl_goto2) -endif (GOTO2_FOUND) - -find_package(GOTO) -if (GOTO_FOUND) - if(GOTO2_FOUND) - btl_add_bench(btl_goto main.cpp OFF) - else() - btl_add_bench(btl_goto main.cpp) - endif() - if(BUILD_btl_goto) - target_link_libraries(btl_goto ${GOTO_LIBRARIES} ) - set_target_properties(btl_goto PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO") - endif(BUILD_btl_goto) -endif (GOTO_FOUND) +find_package(OPENBLAS) +if (OPENBLAS_FOUND) + btl_add_bench(btl_openblas main.cpp) + if(BUILD_btl_openblas) + target_link_libraries(btl_openblas ${OPENBLAS_LIBRARIES} ) + set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS") + endif(BUILD_btl_openblas) +endif (OPENBLAS_FOUND) find_package(ACML) if (ACML_FOUND) diff --git a/bench/btl/libs/BLAS/blas_interface_impl.hh b/bench/btl/libs/BLAS/blas_interface_impl.hh index 0e84df038..fc4ba2a1f 100644 --- a/bench/btl/libs/BLAS/blas_interface_impl.hh +++ b/bench/btl/libs/BLAS/blas_interface_impl.hh @@ -75,7 +75,6 @@ public : static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ int N2 = N*N; BLAS_FUNC(copy)(&N2, X, &intone, C, &intone); - char uplo = 'L'; int info = 0; int * ipiv = (int*)alloca(sizeof(int)*N); BLAS_FUNC(getrf)(&N, &N, C, &N, ipiv, &info); @@ -92,7 +91,7 @@ public : BLAS_FUNC(trsm)(&right, &lower, ¬rans, &nonunit, &N, &N, &fone, L, &N, X, &N); } - static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N){ + static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & /*X*/, int N){ BLAS_FUNC(trmm)(&left, &lower, ¬rans,&nonunit, &N,&N,&fone,A,&N,B,&N); } @@ -101,7 +100,6 @@ public : static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ int N2 = N*N; BLAS_FUNC(copy)(&N2, X, &intone, C, &intone); - char uplo = 'L'; int info = 0; int * ipiv = (int*)alloca(sizeof(int)*N); int * jpiv = (int*)alloca(sizeof(int)*N); @@ -134,8 +132,6 @@ public : } char uplo = 'U'; int info = 0; - int ilo = 1; - int ihi = N; int bsize = 64; int worksize = N*bsize; SCALAR* d = new SCALAR[3*N+worksize]; diff --git a/bench/btl/libs/BLAS/c_interface_base.h b/bench/btl/libs/BLAS/c_interface_base.h index 515d8dcfc..de613803b 100644 --- a/bench/btl/libs/BLAS/c_interface_base.h +++ b/bench/btl/libs/BLAS/c_interface_base.h @@ -17,12 +17,12 @@ public: typedef real* gene_matrix; typedef real* gene_vector; - static void free_matrix(gene_matrix & A, int N){ - delete A; + static void free_matrix(gene_matrix & A, int /*N*/){ + delete[] A; } static void free_vector(gene_vector & B){ - delete B; + delete[] B; } static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ diff --git a/bench/btl/libs/BLAS/main.cpp b/bench/btl/libs/BLAS/main.cpp index 8347c9f0b..564d55ef2 100644 --- a/bench/btl/libs/BLAS/main.cpp +++ b/bench/btl/libs/BLAS/main.cpp @@ -56,13 +56,13 @@ int main() bench > 
>(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); #ifdef HAS_LAPACK - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); #endif //bench > >(MIN_LU,MAX_LU,NB_POINT); diff --git a/bench/btl/libs/STL/STL_interface.hh b/bench/btl/libs/STL/STL_interface.hh index 93e76bd55..ef4cc9233 100644 --- a/bench/btl/libs/STL/STL_interface.hh +++ b/bench/btl/libs/STL/STL_interface.hh @@ -44,9 +44,9 @@ public : return "STL"; } - static void free_matrix(gene_matrix & A, int N){} + static void free_matrix(gene_matrix & /*A*/, int /*N*/){} - static void free_vector(gene_vector & B){} + static void free_vector(gene_vector & /*B*/){} static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ A = A_stl; diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt new file mode 100644 index 000000000..f8b1b2ec3 --- /dev/null +++ b/bench/btl/libs/blaze/CMakeLists.txt @@ -0,0 +1,10 @@ + +find_package(BLAZE) +find_package(Boost) +if (BLAZE_FOUND AND Boost_FOUND) + include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) + btl_add_bench(btl_blaze main.cpp) + if(BUILD_btl_blaze) + target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a ) + endif() +endif () diff --git a/bench/btl/libs/blaze/blaze_interface.hh b/bench/btl/libs/blaze/blaze_interface.hh new file mode 100644 index 000000000..ed43ecdd4 --- /dev/null +++ b/bench/btl/libs/blaze/blaze_interface.hh @@ -0,0 +1,140 @@ +//===================================================== +// Copyright (C) 2008 Gael Guennebaud +//===================================================== +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
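The new blaze backend maps the BTL actions onto Blaze's own containers and operators. For orientation, a rough sketch of what the matrix-vector action boils down to, based on Blaze's documented DynamicMatrix/DynamicVector interface rather than on this patch:

#include <blaze/Blaze.h>
#include <iostream>

int main()
{
  blaze::DynamicMatrix<double> A(3UL, 3UL, 1.0);   // 3x3, all ones
  blaze::DynamicVector<double> x(3UL, 2.0);        // (2, 2, 2)
  blaze::DynamicVector<double> y = A * x;          // what matrix_vector_product() wraps
  std::cout << y[0] << "\n";                       // 6
  return 0;
}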
+// +#ifndef BLAZE_INTERFACE_HH +#define BLAZE_INTERFACE_HH + +#include +#include +// using namespace blaze; + +#include + +template +class blaze_interface { + +public : + + typedef real real_type ; + + typedef std::vector stl_vector; + typedef std::vector stl_matrix; + + typedef blaze::DynamicMatrix gene_matrix; + typedef blaze::DynamicVector gene_vector; + + static inline std::string name() { return "blaze"; } + + static void free_matrix(gene_matrix & A, int N){ + return ; + } + + static void free_vector(gene_vector & B){ + return ; + } + + static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(A_stl[0].size(), A_stl.size()); + + for (int j=0; j ipvt(N); +// lu_factor(R, ipvt); +// } + +// static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector & X, int N){ +// X = lower_trisolve(L, B); +// } + + static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){ + cible = source; + } + + static inline void copy_vector(const gene_vector & source, gene_vector & cible, int N){ + cible = source; + } + +}; + +#endif diff --git a/bench/btl/libs/blaze/main.cpp b/bench/btl/libs/blaze/main.cpp new file mode 100644 index 000000000..582a2956b --- /dev/null +++ b/bench/btl/libs/blaze/main.cpp @@ -0,0 +1,40 @@ +//===================================================== +// Copyright (C) 2008 Gael Guennebaud +//===================================================== +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+// +#include "utilities.h" +#include "blaze_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + bench > >(MIN_MV,MAX_MV,NB_POINT); +// bench > >(MIN_MV,MAX_MV,NB_POINT); +// bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} + + diff --git a/bench/btl/libs/eigen3/eigen3_interface.hh b/bench/btl/libs/eigen3/eigen3_interface.hh index 0c8aa74da..b821fd721 100644 --- a/bench/btl/libs/eigen3/eigen3_interface.hh +++ b/bench/btl/libs/eigen3/eigen3_interface.hh @@ -45,9 +45,9 @@ public : return EIGEN_MAKESTRING(BTL_PREFIX); } - static void free_matrix(gene_matrix & A, int N) {} + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} - static void free_vector(gene_vector & B) {} + static void free_vector(gene_vector & /*B*/) {} static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ A.resize(A_stl[0].size(), A_stl.size()); @@ -74,7 +74,7 @@ public : } static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){ - int N=A_stl.size(); + int N=A_stl.size(); for (int j=0;j().setZero(); X.template selfadjointView().rankUpdate(A); } - static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = A*B; } - static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ + static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = (A.template selfadjointView() * B); // internal::product_selfadjoint_vector(N,A.data(),N, B.data(), 1, X.data(), 1); } @@ -155,54 +155,54 @@ public : } } - static EIGEN_DONT_INLINE void syr2(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){ + static EIGEN_DONT_INLINE void syr2(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){ // internal::product_selfadjoint_rank2_update(N,A.data(),N, X.data(), 1, Y.data(), 1, -1); for(int j=0; j(c,s)); } - static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ + static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int /*N*/){ X.noalias() = (A.transpose()*B); } - static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ Y += coef * X; } - static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){ + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ Y = a*X + b*Y; } - static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){ + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ cible = source; } - static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int N){ + static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){ cible = source; } - static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){ + static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int /*N*/){ X = L.template triangularView().solve(B); } - static 
inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){ + static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){ X = L.template triangularView().solve(B); } - static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){ + static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){ X.noalias() = L.template triangularView() * B; } - static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){ + static inline void cholesky(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = X; internal::llt_inplace::blocked(C); //C = X.llt().matrixL(); @@ -211,11 +211,11 @@ public : // Cholesky::computeInPlaceBlock(C); } - static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ + static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = X.fullPivLu().matrixLU(); } - static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ + static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ Matrix piv(N); DenseIndex nb; C = X; @@ -223,13 +223,13 @@ public : // C = X.partialPivLu().matrixLU(); } - static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){ + static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){ typename Tridiagonalization::CoeffVectorType aux(N-1); C = X; internal::tridiagonalization_inplace(C, aux); } - static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int N){ + static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int /*N*/){ C = HessenbergDecomposition(X).packedMatrix(); } diff --git a/bench/btl/libs/eigen3/main_adv.cpp b/bench/btl/libs/eigen3/main_adv.cpp index efe5857e4..95865357e 100644 --- a/bench/btl/libs/eigen3/main_adv.cpp +++ b/bench/btl/libs/eigen3/main_adv.cpp @@ -29,14 +29,14 @@ BTL_MAIN; int main() { - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); - bench > >(MIN_MM,MAX_MM,NB_POINT); +// bench > >(MIN_LU,MAX_LU,NB_POINT); + bench > >(MIN_LU,MAX_LU,NB_POINT); return 0; } diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp new file mode 100644 index 000000000..f37a8bb5f --- /dev/null +++ b/bench/dense_solvers.cpp @@ -0,0 +1,76 @@ +#include +#include "BenchTimer.h" +#include +#include +#include +using namespace Eigen; + +std::map > results; + +template +void bench(int id, int size = Size) +{ + typedef Matrix Mat; + Mat A(size,size); + A.setRandom(); + A = A*A.adjoint(); + BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_fpqr, t_jsvd; + + int tries = 3; + int rep = 1000/size; + if(rep==0) rep = 1; + rep = rep*rep; + + LLT llt(A); + LDLT ldlt(A); + PartialPivLU lu(A); + FullPivLU fplu(A); + HouseholderQR qr(A); + ColPivHouseholderQR cpqr(A); + FullPivHouseholderQR fpqr(A); + JacobiSVD jsvd(A.rows(),A.cols()); + + BENCH(t_llt, tries, rep, llt.compute(A)); + BENCH(t_ldlt, tries, rep, ldlt.compute(A)); + BENCH(t_lu, tries, rep, lu.compute(A)); + BENCH(t_fplu, tries, rep, fplu.compute(A)); + BENCH(t_qr, tries, rep, qr.compute(A)); + 
BENCH(t_cpqr, tries, rep, cpqr.compute(A)); + BENCH(t_fpqr, tries, rep, fpqr.compute(A)); + if(size<500) // JacobiSVD is really too slow for too large matrices + BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV)); + + results["LLT"][id] = t_llt.best(); + results["LDLT"][id] = t_ldlt.best(); + results["PartialPivLU"][id] = t_lu.best(); + results["FullPivLU"][id] = t_fplu.best(); + results["HouseholderQR"][id] = t_qr.best(); + results["ColPivHouseholderQR"][id] = t_cpqr.best(); + results["FullPivHouseholderQR"][id] = t_fpqr.best(); + results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0; +} + +int main() +{ + const int small = 8; + const int medium = 100; + const int large = 1000; + const int xl = 4000; + + bench(0); + bench(1,medium); + bench(2,large); + bench(3,xl); + + IOFormat fmt(3, 0, " \t", "\n", "", ""); + + std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n"; + std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n"; + std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; + std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; +} diff --git a/bench/spbench/spbenchstyle.h b/bench/spbench/spbenchstyle.h index 17a05ce71..f6a981778 100644 --- a/bench/spbench/spbenchstyle.h +++ b/bench/spbench/spbenchstyle.h @@ -91,4 +91,5 @@ void printBenchStyle(std::ofstream& out) \n\n"; } -#endif \ No newline at end of file + +#endif diff --git a/blas/README.txt b/blas/README.txt index 07a8bd92a..63a5203b9 100644 --- a/blas/README.txt +++ b/blas/README.txt @@ -1,9 +1,6 @@ This directory contains a BLAS library built on top of Eigen. -This is currently a work in progress which is far to be ready for use, -but feel free to contribute to it if you wish. - This module is not built by default. In order to compile it, you need to type 'make blas' from within your build dir. 
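dense_solvers.cpp reports every decomposition as a percentage of LLT; the measurement itself is just BenchTimer plus the BENCH macro from Eigen's bench/ directory (not an installed header). A trimmed-down sketch of the same pattern:

#include <Eigen/Dense>
#include <iostream>
#include "BenchTimer.h"   // lives in Eigen's bench/ directory, not in the installed headers

using namespace Eigen;

int main()
{
  const int size = 100, tries = 3, rep = 100;
  MatrixXd A = MatrixXd::Random(size, size);
  A = A * A.adjoint();                       // make it SPD so LLT/LDLT apply

  LLT<MatrixXd>  llt(A);
  LDLT<MatrixXd> ldlt(A);
  BenchTimer t_llt, t_ldlt;
  BENCH(t_llt,  tries, rep, llt.compute(A));
  BENCH(t_ldlt, tries, rep, ldlt.compute(A));
  // same "relative to LLT" figure as the table printed by dense_solvers.cpp
  std::cout << "LDLT / LLT = " << t_ldlt.best() / t_llt.best() << "\n";
  return 0;
}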
diff --git a/blas/common.h b/blas/common.h index 2bf642c6b..c39cc63a8 100644 --- a/blas/common.h +++ b/blas/common.h @@ -106,13 +106,13 @@ matrix(T* data, int rows, int cols, int stride) } template -Map, 0, InnerStride > vector(T* data, int size, int incr) +Map, 0, InnerStride > make_vector(T* data, int size, int incr) { return Map, 0, InnerStride >(data, size, InnerStride(incr)); } template -Map > vector(T* data, int size) +Map > make_vector(T* data, int size) { return Map >(data, size); } @@ -124,8 +124,8 @@ T* get_compact_vector(T* x, int n, int incx) return x; T* ret = new Scalar[n]; - if(incx<0) vector(ret,n) = vector(x,n,-incx).reverse(); - else vector(ret,n) = vector(x,n, incx); + if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse(); + else make_vector(ret,n) = make_vector(x,n, incx); return ret; } @@ -135,8 +135,8 @@ T* copy_back(T* x_cpy, T* x, int n, int incx) if(x_cpy==x) return 0; - if(incx<0) vector(x,n,-incx).reverse() = vector(x_cpy,n); - else vector(x,n, incx) = vector(x_cpy,n); + if(incx<0) make_vector(x,n,-incx).reverse() = make_vector(x_cpy,n); + else make_vector(x,n, incx) = make_vector(x_cpy,n); return x_cpy; } diff --git a/blas/double.cpp b/blas/double.cpp index 8fd0709ba..295b1d1f2 100644 --- a/blas/double.cpp +++ b/blas/double.cpp @@ -23,11 +23,10 @@ double BLASFUNC(dsdot)(int* n, float* x, int* incx, float* y, int* incy) { if(*n<=0) return 0; - if(*incx==1 && *incy==1) return (vector(x,*n).cast().cwiseProduct(vector(y,*n).cast())).sum(); - else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cast().cwiseProduct(vector(y,*n,*incy).cast())).sum(); - else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cast().cwiseProduct(vector(y,*n,*incy).cast())).sum(); - else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cast().cwiseProduct(vector(y,*n,-*incy).reverse().cast())).sum(); - else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cast().cwiseProduct(vector(y,*n,-*incy).reverse().cast())).sum(); + if(*incx==1 && *incy==1) return (make_vector(x,*n).cast().cwiseProduct(make_vector(y,*n).cast())).sum(); + else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cast().cwiseProduct(make_vector(y,*n,*incy).cast())).sum(); + else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cast().cwiseProduct(make_vector(y,*n,*incy).cast())).sum(); + else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cast().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast())).sum(); + else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cast().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast())).sum(); else return 0; } - diff --git a/blas/level1_cplx_impl.h b/blas/level1_cplx_impl.h index ffe192481..719f5bac9 100644 --- a/blas/level1_cplx_impl.h +++ b/blas/level1_cplx_impl.h @@ -32,13 +32,14 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n, if(*n<=0) return 0; - if(*incx==1) return vector(x,*n).unaryExpr().sum(); - else return vector(x,*n,std::abs(*incx)).unaryExpr().sum(); + if(*incx==1) return make_vector(x,*n).unaryExpr().sum(); + else return make_vector(x,*n,std::abs(*incx)).unaryExpr().sum(); } // computes a dot product of a conjugated vector with another vector. 
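The vector() to make_vector() rename is purely to avoid name clashes; what the helper returns is an Eigen Map with a runtime inner stride over the raw BLAS array. A standalone equivalent, with the helper name chosen here for illustration only:

#include <Eigen/Dense>
#include <iostream>

// stand-in mirroring the strided make_vector() overload of blas/common.h
template<typename T>
Eigen::Map<Eigen::Matrix<T,Eigen::Dynamic,1>, 0, Eigen::InnerStride<> >
map_strided(T* data, int size, int incr)
{
  return Eigen::Map<Eigen::Matrix<T,Eigen::Dynamic,1>, 0, Eigen::InnerStride<> >(
      data, size, Eigen::InnerStride<>(incr));
}

int main()
{
  double x[6] = {1, -1, 2, -1, 3, -1};
  std::cout << map_strided(x, 3, 2).sum() << "\n";   // BLAS increment 2: sums 1+2+3 = 6
  return 0;
}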
int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres) { +// std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n"; Scalar* res = reinterpret_cast(pres); if(*n<=0) @@ -50,11 +51,11 @@ int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, in Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) *res = (vector(x,*n).dot(vector(y,*n))); - else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).dot(vector(y,*n,*incy))); - else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,*incy))); - else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).dot(vector(y,*n,-*incy).reverse())); - else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,-*incy).reverse())); + if(*incx==1 && *incy==1) *res = (make_vector(x,*n).dot(make_vector(y,*n))); + else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,*incy))); + else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,*incy))); + else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,-*incy).reverse())); + else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,-*incy).reverse())); return 0; } @@ -72,11 +73,11 @@ int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, in Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) *res = (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); - else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); - else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum(); - else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); - else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); + if(*incx==1 && *incy==1) *res = (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum(); + else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum(); + else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum(); + else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum(); + else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum(); return 0; } @@ -88,9 +89,9 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n, Scalar* x = reinterpret_cast(px); if(*incx==1) - return vector(x,*n).stableNorm(); + return make_vector(x,*n).stableNorm(); - return vector(x,*n,*incx).stableNorm(); + return make_vector(x,*n,*incx).stableNorm(); } int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps) @@ -102,8 +103,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScal RealScalar c = *pc; RealScalar s = *ps; - StridedVectorType vx(vector(x,*n,std::abs(*incx))); - StridedVectorType vy(vector(y,*n,std::abs(*incy))); + StridedVectorType vx(make_vector(x,*n,std::abs(*incx))); + StridedVectorType vy(make_vector(y,*n,std::abs(*incy))); Reverse rvx(vx); Reverse rvy(vy); @@ -125,9 +126,8 @@ int 
EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealSca // std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n"; - if(*incx==1) vector(x,*n) *= alpha; - else vector(x,*n,std::abs(*incx)) *= alpha; + if(*incx==1) make_vector(x,*n) *= alpha; + else make_vector(x,*n,std::abs(*incx)) *= alpha; return 0; } - diff --git a/blas/level1_impl.h b/blas/level1_impl.h index b08c2f6be..98e4c0963 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -17,11 +17,11 @@ int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx, if(*n<=0) return 0; - if(*incx==1 && *incy==1) vector(y,*n) += alpha * vector(x,*n); - else if(*incx>0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,*incx); - else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,*incx); - else if(*incx<0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,-*incx).reverse(); - else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,-*incx).reverse(); + if(*incx==1 && *incy==1) make_vector(y,*n) += alpha * make_vector(x,*n); + else if(*incx>0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,*incx); + else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,*incx); + else if(*incx<0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,-*incx).reverse(); + else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,-*incx).reverse(); return 0; } @@ -35,7 +35,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int // be carefull, *incx==0 is allowed !! if(*incx==1 && *incy==1) - vector(y,*n) = vector(x,*n); + make_vector(y,*n) = make_vector(x,*n); else { if(*incx<0) x = x - (*n-1)*(*incx); @@ -57,8 +57,8 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *inc Scalar* x = reinterpret_cast(px); DenseIndex ret; - if(*incx==1) vector(x,*n).cwiseAbs().maxCoeff(&ret); - else vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret); + if(*incx==1) make_vector(x,*n).cwiseAbs().maxCoeff(&ret); + else make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret); return ret+1; } @@ -66,10 +66,10 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amin_)(int *n, RealScalar *px, int *inc { if(*n<=0) return 0; Scalar* x = reinterpret_cast(px); - + DenseIndex ret; - if(*incx==1) vector(x,*n).cwiseAbs().minCoeff(&ret); - else vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret); + if(*incx==1) make_vector(x,*n).cwiseAbs().minCoeff(&ret); + else make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret); return ret+1; } @@ -77,7 +77,7 @@ int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealSc { using std::sqrt; using std::abs; - + Scalar& a = *reinterpret_cast(pa); Scalar& b = *reinterpret_cast(pb); RealScalar* c = pc; @@ -143,8 +143,8 @@ int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx) Scalar* x = reinterpret_cast(px); Scalar alpha = *reinterpret_cast(palpha); - if(*incx==1) vector(x,*n) *= alpha; - else vector(x,*n,std::abs(*incx)) *= alpha; + if(*incx==1) make_vector(x,*n) *= alpha; + else make_vector(x,*n,std::abs(*incx)) *= alpha; return 0; } @@ -156,12 +156,11 @@ int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) vector(y,*n).swap(vector(x,*n)); - else if(*incx>0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,*incx)); - 
else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,*incx)); - else if(*incx<0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,-*incx).reverse()); - else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,-*incx).reverse()); + if(*incx==1 && *incy==1) make_vector(y,*n).swap(make_vector(x,*n)); + else if(*incx>0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,*incx)); + else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,*incx)); + else if(*incx<0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,-*incx).reverse()); + else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,-*incx).reverse()); return 1; } - diff --git a/blas/level1_real_impl.h b/blas/level1_real_impl.h index 8acecdfc6..02586d519 100644 --- a/blas/level1_real_impl.h +++ b/blas/level1_real_impl.h @@ -19,8 +19,8 @@ RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx) if(*n<=0) return 0; - if(*incx==1) return vector(x,*n).cwiseAbs().sum(); - else return vector(x,*n,std::abs(*incx)).cwiseAbs().sum(); + if(*incx==1) return make_vector(x,*n).cwiseAbs().sum(); + else return make_vector(x,*n,std::abs(*incx)).cwiseAbs().sum(); } // computes a vector-vector dot product. @@ -33,11 +33,11 @@ Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, i Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) return (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); - else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); - else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum(); - else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); - else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); + if(*incx==1 && *incy==1) return (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum(); + else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum(); + else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum(); + else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum(); + else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum(); else return 0; } @@ -50,8 +50,8 @@ Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx) Scalar* x = reinterpret_cast(px); - if(*incx==1) return vector(x,*n).stableNorm(); - else return vector(x,*n,std::abs(*incx)).stableNorm(); + if(*incx==1) return make_vector(x,*n).stableNorm(); + else return make_vector(x,*n,std::abs(*incx)).stableNorm(); } int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps) @@ -64,8 +64,8 @@ int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int Scalar c = *reinterpret_cast(pc); Scalar s = *reinterpret_cast(ps); - StridedVectorType vx(vector(x,*n,std::abs(*incx))); - StridedVectorType vy(vector(y,*n,std::abs(*incy))); + StridedVectorType vx(make_vector(x,*n,std::abs(*incx))); + StridedVectorType vy(make_vector(y,*n,std::abs(*incy))); Reverse rvx(vx); Reverse rvy(vy); diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h index b850b6cd1..afa9a7493 100644 --- 
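The BLAS entry points above all reduce a raw pointer plus increment to an Eigen expression before doing any arithmetic. The following is only a sketch of that mapping, under the assumption that a strided Map is used; the helper name and the fixed double scalar type are illustrative and not Eigen's actual make_vector implementation:
\code
#include <Eigen/Core>
// Sketch only: view n scalars starting at x, taking every incx-th element.
typedef Eigen::Map<Eigen::VectorXd, 0, Eigen::InnerStride<Eigen::Dynamic> > StridedVec;
inline StridedVec make_vector_sketch(double* x, int n, int incx)
{
  return StridedVec(x, n, Eigen::InnerStride<Eigen::Dynamic>(incx));
}
\endcode
With such a mapping, a negative BLAS increment is handled by mapping with the absolute stride and calling .reverse(), which is exactly the pattern visible in the dot, axpy and swap branches above.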
a/blas/level2_cplx_impl.h +++ b/blas/level2_cplx_impl.h @@ -57,8 +57,8 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa if(beta!=Scalar(1)) { - if(beta==Scalar(0)) vector(actual_y, *n).setZero(); - else vector(actual_y, *n) *= beta; + if(beta==Scalar(0)) make_vector(actual_y, *n).setZero(); + else make_vector(actual_y, *n) *= beta; } if(alpha!=Scalar(0)) diff --git a/blas/level2_impl.h b/blas/level2_impl.h index 5f3941975..233c7b753 100644 --- a/blas/level2_impl.h +++ b/blas/level2_impl.h @@ -58,8 +58,8 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca if(beta!=Scalar(1)) { - if(beta==Scalar(0)) vector(actual_c, actual_m).setZero(); - else vector(actual_c, actual_m) *= beta; + if(beta==Scalar(0)) make_vector(actual_c, actual_m).setZero(); + else make_vector(actual_c, actual_m) *= beta; } if(code>=4 || func[code]==0) @@ -206,7 +206,7 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca Scalar alpha = *reinterpret_cast(palpha); Scalar beta = *reinterpret_cast(pbeta); int coeff_rows = *kl+*ku+1; - + int info = 0; if(OP(*trans)==INVALID) info = 1; else if(*m<0) info = 2; @@ -218,26 +218,26 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca else if(*incy==0) info = 13; if(info) return xerbla_(SCALAR_SUFFIX_UP"GBMV ",&info,6); - + if(*m==0 || *n==0 || (alpha==Scalar(0) && beta==Scalar(1))) return 0; - + int actual_m = *m; int actual_n = *n; if(OP(*trans)!=NOTR) std::swap(actual_m,actual_n); - + Scalar* actual_x = get_compact_vector(x,actual_n,*incx); Scalar* actual_y = get_compact_vector(y,actual_m,*incy); - + if(beta!=Scalar(1)) { - if(beta==Scalar(0)) vector(actual_y, actual_m).setZero(); - else vector(actual_y, actual_m) *= beta; + if(beta==Scalar(0)) make_vector(actual_y, actual_m).setZero(); + else make_vector(actual_y, actual_m) *= beta; } - + MatrixType mat_coeffs(a,coeff_rows,*n,*lda); - + int nb = std::min(*n,(*m)+(*ku)); for(int j=0; j(pa); Scalar* x = reinterpret_cast(px); int coeff_rows = *k + 1; - + int info = 0; if(UPLO(*uplo)==INVALID) info = 1; else if(OP(*opa)==INVALID) info = 2; @@ -283,37 +283,37 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea else if(*incx==0) info = 9; if(info) return xerbla_(SCALAR_SUFFIX_UP"TBMV ",&info,6); - + if(*n==0) return 0; - + int actual_n = *n; - + Scalar* actual_x = get_compact_vector(x,actual_n,*incx); - + MatrixType mat_coeffs(a,coeff_rows,*n,*lda); - + int ku = UPLO(*uplo)==UPPER ? *k : 0; int kl = UPLO(*uplo)==LOWER ? 
*k : 0; - + for(int j=0; j<*n; ++j) { int start = std::max(0,j - ku); int end = std::min((*m)-1,j + kl); int len = end - start + 1; int offset = (ku) - j + start; - + if(OP(*trans)==NOTR) - vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len); + make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len); else if(OP(*trans)==TR) - actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value(); + actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value(); else - actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * vector(actual_x+start,len) ).value(); - } - + actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value(); + } + if(actual_x!=x) delete[] actual_x; if(actual_y!=y) delete[] copy_back(actual_y,y,actual_m,*incy); - + return 0; } #endif @@ -362,7 +362,7 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real Scalar* a = reinterpret_cast(pa); Scalar* x = reinterpret_cast(px); int coeff_rows = *k+1; - + int info = 0; if(UPLO(*uplo)==INVALID) info = 1; else if(OP(*op)==INVALID) info = 2; @@ -373,22 +373,22 @@ int EIGEN_BLAS_FUNC(tbsv)(char *uplo, char *op, char *diag, int *n, int *k, Real else if(*incx==0) info = 9; if(info) return xerbla_(SCALAR_SUFFIX_UP"TBSV ",&info,6); - + if(*n==0 || (*k==0 && DIAG(*diag)==UNIT)) return 0; - + int actual_n = *n; - + Scalar* actual_x = get_compact_vector(x,actual_n,*incx); - + int code = OP(*op) | (UPLO(*uplo) << 2) | (DIAG(*diag) << 3); if(code>=16 || func[code]==0) return 0; func[code](*n, *k, a, *lda, actual_x); - + if(actual_x!=x) delete[] copy_back(actual_x,x,actual_n,*incx); - + return 0; } @@ -521,4 +521,3 @@ int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar return 1; } - diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h index 8d56eaaa1..9722a4674 100644 --- a/blas/level2_real_impl.h +++ b/blas/level2_real_impl.h @@ -51,8 +51,8 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p if(beta!=Scalar(1)) { - if(beta==Scalar(0)) vector(actual_y, *n).setZero(); - else vector(actual_y, *n) *= beta; + if(beta==Scalar(0)) make_vector(actual_y, *n).setZero(); + else make_vector(actual_y, *n) *= beta; } int code = UPLO(*uplo); @@ -179,7 +179,7 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px Scalar* x_cpy = get_compact_vector(x,*n,*incx); Scalar* y_cpy = get_compact_vector(y,*n,*incy); - + int code = UPLO(*uplo); if(code>=2 || func[code]==0) return 0; @@ -366,5 +366,3 @@ int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx, return 1; } - - diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp index 0d57710fe..0422f79b7 100644 --- a/blas/xerbla.cpp +++ b/blas/xerbla.cpp @@ -1,7 +1,7 @@ #include -#if (defined __GNUC__) +#if (defined __GNUC__) && (!defined __MINGW32__) #define EIGEN_WEAK_LINKING __attribute__ ((weak)) #else #define EIGEN_WEAK_LINKING diff --git a/cmake/Eigen3Config.cmake.in b/cmake/Eigen3Config.cmake.in index 257c595ed..e50f6dbe0 100644 --- a/cmake/Eigen3Config.cmake.in +++ b/cmake/Eigen3Config.cmake.in @@ -3,26 +3,26 @@ # Eigen3Config.cmake(.in) # Use the following variables to compile and link against Eigen: -# EIGEN_FOUND - True if Eigen was found on your system -# EIGEN_USE_FILE - The file making Eigen usable -# 
EIGEN_DEFINITIONS - Definitions needed to build with Eigen -# EIGEN_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found -# EIGEN_INCLUDE_DIRS - List of directories of Eigen and it's dependencies -# EIGEN_ROOT_DIR - The base directory of Eigen -# EIGEN_VERSION_STRING - A human-readable string containing the version -# EIGEN_VERSION_MAJOR - The major version of Eigen -# EIGEN_VERSION_MINOR - The minor version of Eigen -# EIGEN_VERSION_PATCH - The patch version of Eigen +# EIGEN3_FOUND - True if Eigen was found on your system +# EIGEN3_USE_FILE - The file making Eigen usable +# EIGEN3_DEFINITIONS - Definitions needed to build with Eigen +# EIGEN3_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found +# EIGEN3_INCLUDE_DIRS - List of directories of Eigen and it's dependencies +# EIGEN3_ROOT_DIR - The base directory of Eigen +# EIGEN3_VERSION_STRING - A human-readable string containing the version +# EIGEN3_VERSION_MAJOR - The major version of Eigen +# EIGEN3_VERSION_MINOR - The minor version of Eigen +# EIGEN3_VERSION_PATCH - The patch version of Eigen -set ( EIGEN_FOUND 1 ) -set ( EIGEN_USE_FILE "@EIGEN_USE_FILE@" ) +set ( EIGEN3_FOUND 1 ) +set ( EIGEN3_USE_FILE "@EIGEN_USE_FILE@" ) -set ( EIGEN_DEFINITIONS "@EIGEN_DEFINITIONS@" ) -set ( EIGEN_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" ) -set ( EIGEN_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" ) -set ( EIGEN_ROOT_DIR "@EIGEN_ROOT_DIR@" ) +set ( EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@" ) +set ( EIGEN3_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" ) +set ( EIGEN3_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" ) +set ( EIGEN3_ROOT_DIR "@EIGEN_ROOT_DIR@" ) -set ( EIGEN_VERSION_STRING "@EIGEN_VERSION_STRING@" ) -set ( EIGEN_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" ) -set ( EIGEN_VERSION_MINOR "@EIGEN_VERSION_MINOR@" ) -set ( EIGEN_VERSION_PATCH "@EIGEN_VERSION_PATCH@" ) +set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" ) +set ( EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" ) +set ( EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@" ) +set ( EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@" ) diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index 7844bf4d3..0b5de95bb 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -49,7 +49,7 @@ else() set(EIGEN_MAKECOMMAND_PLACEHOLDER "${EIGEN_BUILD_COMMAND}") endif() -configure_file(${CMAKE_BINARY_DIR}/DartConfiguration.tcl ${CMAKE_BINARY_DIR}/DartConfiguration.tcl) +configure_file(${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl ${CMAKE_BINARY_DIR}/DartConfiguration.tcl) # restore default CMAKE_MAKE_PROGRAM set(CMAKE_MAKE_PROGRAM ${CMAKE_MAKE_PROGRAM_SAVE}) @@ -58,7 +58,7 @@ set(CMAKE_MAKE_PROGRAM ${CMAKE_MAKE_PROGRAM_SAVE}) unset(CMAKE_MAKE_PROGRAM_SAVE) unset(EIGEN_MAKECOMMAND_PLACEHOLDER) -configure_file(${CMAKE_SOURCE_DIR}/CTestCustom.cmake.in ${CMAKE_BINARY_DIR}/CTestCustom.cmake) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CTestCustom.cmake.in ${CMAKE_BINARY_DIR}/CTestCustom.cmake) # some documentation of this function would be nice ei_init_testing() diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index fdc166bf8..9b9776894 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -264,6 +264,18 @@ macro(ei_testing_print_summary) message(STATUS "SSE4.2: Using architecture defaults") endif() + if(EIGEN_TEST_AVX) + message(STATUS "AVX: ON") + else() + message(STATUS "AVX: Using architecture defaults") + endif() + + if(EIGEN_TEST_FMA) + message(STATUS "FMA: ON") + else() + message(STATUS "FMA: Using 
architecture defaults") + endif() + if(EIGEN_TEST_ALTIVEC) message(STATUS "Altivec: ON") else() @@ -408,6 +420,10 @@ macro(ei_get_cxxflags VAR) set(${VAR} NEON) elseif(EIGEN_TEST_ALTIVEC) set(${VAR} ALVEC) + elseif(EIGEN_TEST_FMA) + set(${VAR} FMA) + elseif(EIGEN_TEST_AVX) + set(${VAR} AVX) elseif(EIGEN_TEST_SSE4_2) set(${VAR} SSE42) elseif(EIGEN_TEST_SSE4_1) diff --git a/cmake/language_support.cmake b/cmake/language_support.cmake index d687b71f6..93f8a8fd8 100644 --- a/cmake/language_support.cmake +++ b/cmake/language_support.cmake @@ -33,7 +33,7 @@ function(workaround_9220 language language_works) file(WRITE ${CMAKE_BINARY_DIR}/language_tests/${language}/CMakeLists.txt ${text}) execute_process( - COMMAND ${CMAKE_COMMAND} . + COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language} RESULT_VARIABLE return_code OUTPUT_QUIET @@ -64,3 +64,4 @@ endfunction(workaround_9220) #message("CXX_language_works = ${CXX_language_works}") #workaround_9220(CXXp CXXp_language_works) #message("CXXp_language_works = ${CXXp_language_works}") + diff --git a/debug/gdb/printers.py b/debug/gdb/printers.py index 86996a4f9..0d67a5f99 100644 --- a/debug/gdb/printers.py +++ b/debug/gdb/printers.py @@ -49,7 +49,7 @@ class EigenMatrixPrinter: regex = re.compile('\<.*\>') m = regex.findall(tag)[0][1:-1] template_params = m.split(',') - template_params = map(lambda x:x.replace(" ", ""), template_params) + template_params = [x.replace(" ", "") for x in template_params] if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1': self.rows = val['m_storage']['m_rows'] @@ -88,8 +88,11 @@ class EigenMatrixPrinter: def __iter__ (self): return self - + def next(self): + return self.__next__() # Python 2.x compatibility + + def __next__(self): row = self.currentRow col = self.currentCol @@ -151,8 +154,11 @@ class EigenQuaternionPrinter: def __iter__ (self): return self - + def next(self): + return self.__next__() # Python 2.x compatibility + + def __next__(self): element = self.currentElement if self.currentElement >= 4: #there are 4 elements in a quanternion diff --git a/demos/opengl/quaternion_demo.cpp b/demos/opengl/quaternion_demo.cpp index 04165619b..dd323a4c9 100644 --- a/demos/opengl/quaternion_demo.cpp +++ b/demos/opengl/quaternion_demo.cpp @@ -234,7 +234,7 @@ void RenderingWidget::drawScene() gpu.drawVector(Vector3f::Zero(), length*Vector3f::UnitZ(), Color(0,0,1,1)); // draw the fractal object - float sqrt3 = internal::sqrt(3.); + float sqrt3 = std::sqrt(3.); glLightfv(GL_LIGHT0, GL_AMBIENT, Vector4f(0.5,0.5,0.5,1).data()); glLightfv(GL_LIGHT0, GL_DIFFUSE, Vector4f(0.5,1,0.5,1).data()); glLightfv(GL_LIGHT0, GL_SPECULAR, Vector4f(1,1,1,1).data()); diff --git a/demos/opengl/trackball.cpp b/demos/opengl/trackball.cpp index 77ac790c8..7c2da8e96 100644 --- a/demos/opengl/trackball.cpp +++ b/demos/opengl/trackball.cpp @@ -23,7 +23,7 @@ void Trackball::track(const Vector2i& point2D) { Vector3f axis = mLastPoint3D.cross(newPoint3D).normalized(); float cos_angle = mLastPoint3D.dot(newPoint3D); - if ( internal::abs(cos_angle) < 1.0 ) + if ( std::abs(cos_angle) < 1.0 ) { float angle = 2. 
* acos(cos_angle); if (mMode==Around) diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index 4c8fe2f47..c4d021624 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -41,8 +41,8 @@ MatrixXd::Ones(rows,cols) // ones(rows,cols) C.setOnes(rows,cols) // C = ones(rows,cols) MatrixXd::Random(rows,cols) // rand(rows,cols)*2-1 // MatrixXd::Random returns uniform random numbers in (-1, 1). C.setRandom(rows,cols) // C = rand(rows,cols)*2-1 -VectorXd::LinSpace(size,low,high) // linspace(low,high,size)' -v.setLinSpace(size,low,high) // v = linspace(low,high,size)' +VectorXd::LinSpaced(size,low,high) // linspace(low,high,size)' +v.setLinSpaced(size,low,high) // v = linspace(low,high,size)' // Matrix slicing and blocks. All expressions listed here are read/write. @@ -168,6 +168,8 @@ x.cross(y) // cross(x, y) Requires #include A.cast(); // double(A) A.cast(); // single(A) A.cast(); // int32(A) +A.real(); // real(A) +A.imag(); // imag(A) // if the original type equals destination type, no work is done // Note that for most operations Eigen requires all operands to have the same type: diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 85af9f1d4..7bbf693a0 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -223,7 +223,7 @@ ALIASES = "only_for_vectors=This is only for vectors (either row- "note_about_using_kernel_to_study_multiple_solutions=If you need a complete analysis of the space of solutions, take the one solution obtained by this method and add to it elements of the kernel, as determined by kernel()." \ "note_about_checking_solutions=This method just tries to find as good a solution as possible. If you want to check whether a solution exists or if it is accurate, just call this function to get a result and then compute the error of this result, or use MatrixBase::isApprox() directly, for instance like this: \code bool a_solution_exists = (A*result).isApprox(b, precision); \endcode This method avoids dividing by zero, so that the non-existence of a solution doesn't by itself mean that you'll get \c inf or \c nan values." \ "note_try_to_help_rvo=This function returns the result by value. In order to make that efficient, it is implemented as just a return statement using a special constructor, hopefully allowing the compiler to perform a RVO (return value optimization)." \ - "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\" + "nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\"" ALIASES += "eigenAutoToc= " @@ -1583,6 +1583,7 @@ PREDEFINED = EIGEN_EMPTY_STRUCT \ EIGEN_VECTORIZE \ EIGEN_QT_SUPPORT \ EIGEN_STRONG_INLINE=inline \ + EIGEN_DEVICE_FUNC= \ "EIGEN2_SUPPORT_STAGE=99" \ "EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR)=template const CwiseBinaryOp, const Derived, const OtherDerived> METHOD(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const;" \ "EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS)=CwiseBinaryOp, const LHS, const RHS>" diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index c00be10d3..f0456ff52 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -140,7 +140,16 @@ x2 = solver.solve(b2); For direct methods, the solution are computed at the machine precision. Sometimes, the solution need not be too accurate. 
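A minimal, illustrative sketch of what relaxing the accuracy looks like with one of Eigen's iterative solvers is given below; ConjugateGradient assumes a symmetric positive-definite matrix, and the function name is only an example:
\code
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
// Sketch: solve A x = b to a relative residual of 1e-5 instead of machine precision.
Eigen::VectorXd solve_loosely(const Eigen::SparseMatrix<double>& A, const Eigen::VectorXd& b)
{
  Eigen::ConjugateGradient<Eigen::SparseMatrix<double> > cg;
  cg.setTolerance(1e-5);   // desired accuracy, set before the solve step
  cg.compute(A);
  return cg.solve(b);
}
\endcode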
In this case, the iterative methods are more suitable and the desired accuracy can be set before the solve step using \b setTolerance(). For all the available functions, please, refer to the documentation of the \link IterativeLinearSolvers_Module Iterative solvers module \endlink. \section BenchmarkRoutine -Most of the time, all you need is to know how much time it will take to qolve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to bench/spbench and compile the routine by typing \b make \e spbenchsolver. Run it with --help option to get the list of all available options. Basically, the matrices to test should be in MatrixMarket Coordinate format, and the routine returns the statistics from all available solvers in Eigen. +Most of the time, all you need is to know how much time it will take to solve your system, and hopefully, what is the most suitable solver. In Eigen, we provide a benchmark routine that can be used for this purpose. It is very easy to use. In the build directory, navigate to bench/spbench and compile the routine by typing \b make \e spbenchsolver. Run it with --help option to get the list of all available options. Basically, the matrices to test should be in MatrixMarket Coordinate format, and the routine returns the statistics from all available solvers in Eigen. + +To export your matrices and right-hand-side vectors in the matrix-market format, you can use the unsupported SparseExtra module: +\code +#include +... +Eigen::saveMarket(A, "filename.mtx"); +Eigen::saveMarket(A, "filename_SPD.mtx", Eigen::Symmetric); // if A is symmetric-positive-definite +Eigen::saveMarketVector(B, "filename_b.mtx"); +\endcode The following table gives an example of XML statistics from several Eigen built-in and external solvers. diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 77f2c92ab..5bcff2c96 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -116,7 +116,7 @@ For an introduction on linear solvers and decompositions, check this \link Tutor - + 
JacobiSVD (two-sided) - Slow (but fast for small matrices)Excellent-Proven3Proven3 Yes Singular values/vectors, least squares Yes (and does least squares)Yes Eigenvalues/vectors -GoodExcellent Closed forms for 2x2 and 3x3
Decomposition MethodRequirements on the matrixSpeedRequirements
on the matrix
Speed
(small-to-medium)
Speed
(large)
Accuracy
partialPivLu() Invertible ++++ +
fullPivLu() None -- - +++
householderQr() None ++++ +
ColPivHouseholderQR colPivHouseholderQr() None+ ++-+++
FullPivHouseholderQR fullPivHouseholderQr() None -- - +++
llt() Positive definite ++++++ +
LDLT ldlt()Positive or negative semidefinitePositive or negative
semidefinite
++++ ++
JacobiSVDjacobiSvd()None- -- - -+++
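As a minimal sketch of the typical call pattern for the entries in this table (assuming a self-adjoint positive semidefinite matrix; the function name is illustrative):
\code
#include <Eigen/Dense>
// Sketch: solve A x = b with one of the Cholesky-type factorizations listed above.
Eigen::VectorXd solve_spd(const Eigen::MatrixXd& A, const Eigen::VectorXd& b)
{
  // llt() needs A strictly positive definite; ldlt() also accepts semidefinite matrices.
  return A.ldlt().solve(b);
}
\endcode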
All of these decompositions offer a solve() method that works as in the above example. For example, if your matrix is positive definite, the above table says that a very good -choice is then the LDLT decomposition. Here's an example, also demonstrating that using a general +choice is then the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general matrix (not a vector) as right hand side is possible. diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox index 992cf6f34..eb6787dbc 100644 --- a/doc/TutorialReductionsVisitorsBroadcasting.dox +++ b/doc/TutorialReductionsVisitorsBroadcasting.dox @@ -32,7 +32,7 @@ Eigen also provides the \link MatrixBase::norm() norm() \endlink method, which r These operations can also operate on matrices; in that case, a n-by-p matrix is seen as a vector of size (n*p), so for example the \link MatrixBase::norm() norm() \endlink method returns the "Frobenius" or "Hilbert-Schmidt" norm. We refrain from speaking of the \f$\ell^2\f$ norm of a matrix because that can mean different things. -If you want other \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm() lpNnorm

() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients. +If you want other \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm() lpNorm

() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients. The following example demonstrates these methods. diff --git a/doc/snippets/Cwise_atan.cpp b/doc/snippets/Cwise_atan.cpp new file mode 100644 index 000000000..446844726 --- /dev/null +++ b/doc/snippets/Cwise_atan.cpp @@ -0,0 +1,2 @@ +ArrayXd v = ArrayXd::LinSpaced(5,0,1); +cout << v.atan() << endl; diff --git a/lapack/eigenvalues.cpp b/lapack/eigenvalues.cpp index a1526ebcd..6141032ab 100644 --- a/lapack/eigenvalues.cpp +++ b/lapack/eigenvalues.cpp @@ -64,14 +64,14 @@ EIGEN_LAPACK_FUNC(syev,(char *jobz, char *uplo, int* n, Scalar* a, int *lda, Sca if(eig.info()==NoConvergence) { - vector(w,*n).setZero(); + make_vector(w,*n).setZero(); if(computeVectors) matrix(a,*n,*n,*lda).setIdentity(); //*info = 1; return 0; } - vector(w,*n) = eig.eigenvalues(); + make_vector(w,*n) = eig.eigenvalues(); if(computeVectors) matrix(a,*n,*n,*lda) = eig.eigenvectors(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 83cdb40b6..991803ca2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -13,11 +13,26 @@ if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h) endforeach() endif() +# check if we have a Fortran compiler +include("../cmake/language_support.cmake") + +workaround_9220(Fortran EIGEN_Fortran_COMPILER_WORKS) + +if(EIGEN_Fortran_COMPILER_WORKS) + enable_language(Fortran OPTIONAL) + if(NOT CMAKE_Fortran_COMPILER) + set(EIGEN_Fortran_COMPILER_WORKS OFF) + endif() +endif() + +if(NOT EIGEN_Fortran_COMPILER_WORKS) + # search for a default Lapack library to complete Eigen's one + find_package(LAPACK) +endif() + # configure blas/lapack (use Eigen's ones) -set(BLAS_FOUND TRUE) -set(LAPACK_FOUND TRUE) -set(BLAS_LIBRARIES eigen_blas) -set(LAPACK_LIBRARIES eigen_lapack) +set(EIGEN_BLAS_LIBRARIES eigen_blas) +set(EIGEN_LAPACK_LIBRARIES eigen_lapack) set(EIGEN_TEST_MATRIX_DIR "" CACHE STRING "Enable testing of realword sparse matrices contained in the specified path") if(EIGEN_TEST_MATRIX_DIR) @@ -32,33 +47,33 @@ endif(EIGEN_TEST_MATRIX_DIR) set(SPARSE_LIBS " ") find_package(Cholmod) -if(CHOLMOD_FOUND AND BLAS_FOUND AND LAPACK_FOUND) +if(CHOLMOD_FOUND) add_definitions("-DEIGEN_CHOLMOD_SUPPORT") include_directories(${CHOLMOD_INCLUDES}) - set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) - set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) + set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "Cholmod, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "Cholmod, ") endif() find_package(Umfpack) -if(UMFPACK_FOUND AND BLAS_FOUND) +if(UMFPACK_FOUND) add_definitions("-DEIGEN_UMFPACK_SUPPORT") include_directories(${UMFPACK_INCLUDES}) - set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES}) - set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "UmfPack, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "UmfPack, ") endif() find_package(SuperLU) -if(SUPERLU_FOUND AND BLAS_FOUND) +if(SUPERLU_FOUND) add_definitions("-DEIGEN_SUPERLU_SUPPORT") 
include_directories(${SUPERLU_INCLUDES}) - set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${BLAS_LIBRARIES}) - set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${BLAS_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ") @@ -68,7 +83,7 @@ endif() find_package(Pastix) find_package(Scotch) find_package(Metis) -if(PASTIX_FOUND AND BLAS_FOUND) +if(PASTIX_FOUND) add_definitions("-DEIGEN_PASTIX_SUPPORT") include_directories(${PASTIX_INCLUDES}) if(SCOTCH_FOUND) @@ -80,8 +95,8 @@ if(PASTIX_FOUND AND BLAS_FOUND) else(SCOTCH_FOUND) ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ") endif(SCOTCH_FOUND) - set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${BLAS_LIBRARIES}) - set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${BLAS_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "PaStiX, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ") @@ -96,16 +111,14 @@ else() endif() find_package(SPQR) -if(SPQR_FOUND AND BLAS_FOUND AND LAPACK_FOUND) - if(CHOLMOD_FOUND) - add_definitions("-DEIGEN_SPQR_SUPPORT") - include_directories(${SPQR_INCLUDES}) - set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) - set(SPARSE_LIBS ${SPARSE_LIBS} ${SPQR_ALL_LIBS}) - ei_add_property(EIGEN_TESTED_BACKENDS "SPQR, ") - else(CHOLMOD_FOUND) - ei_add_property(EIGEN_MISSING_BACKENDS "SPQR, ") - endif(CHOLMOD_FOUND) +if(SPQR_FOUND AND CHOLMOD_FOUND AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) + add_definitions("-DEIGEN_SPQR_SUPPORT") + include_directories(${SPQR_INCLUDES}) + set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${SPQR_ALL_LIBS}) + ei_add_property(EIGEN_TESTED_BACKENDS "SPQR, ") +else() + ei_add_property(EIGEN_MISSING_BACKENDS "SPQR, ") endif() option(EIGEN_TEST_NOQT "Disable Qt support in unit tests" OFF) @@ -306,6 +319,9 @@ find_package(CUDA) if(CUDA_FOUND) set(CUDA_PROPAGATE_HOST_FLAGS OFF) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CUDA_NVCC_FLAGS "-ccbin /usr/bin/clang" CACHE STRING "nvcc flags" FORCE) + endif() cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR}) set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") diff --git a/test/array.cpp b/test/array.cpp index 5f49fc1ea..010fead2d 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -178,6 +178,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.asin(), asin(m1)); VERIFY_IS_APPROX(m1.acos(), acos(m1)); VERIFY_IS_APPROX(m1.tan(), tan(m1)); + VERIFY_IS_APPROX(m1.atan(), atan(m1)); VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval())); diff --git a/test/cholesky.cpp b/test/cholesky.cpp index d4d90e467..569318f83 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -82,10 +82,6 @@ template void cholesky(const MatrixType& m) symm += a1 * a1.adjoint(); } - // to test if really Cholesky only uses the upper triangular part, uncomment the following - // FIXME: currently that fails !! 
- //symm.template part().setZero(); - { SquareMatrixType symmUp = symm.template triangularView(); SquareMatrixType symmLo = symm.template triangularView(); diff --git a/test/cholmod_support.cpp b/test/cholmod_support.cpp index 8f8be3c0e..87f119b1e 100644 --- a/test/cholmod_support.cpp +++ b/test/cholmod_support.cpp @@ -7,6 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" #include diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index aa7f7a599..4c7e96c10 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -129,7 +129,7 @@ void test_cuda_basic() CALL_SUBTEST( run_and_compare_to_cuda(prod(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_c(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp index 8bbda1c94..c98cc80f0 100644 --- a/test/dynalloc.cpp +++ b/test/dynalloc.cpp @@ -10,11 +10,13 @@ #include "main.h" #if EIGEN_ALIGN -#define ALIGNMENT 16 +#define ALIGNMENT EIGEN_ALIGN_BYTES #else #define ALIGNMENT 1 #endif +typedef Matrix Vector8f; + void check_handmade_aligned_malloc() { for(int i = 1; i < 1000; i++) @@ -68,7 +70,7 @@ struct MyStruct { EIGEN_MAKE_ALIGNED_OPERATOR_NEW char dummychar; - Vector4f avec; + Vector8f avec; }; class MyClassA @@ -76,15 +78,19 @@ class MyClassA public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW char dummychar; - Vector4f avec; + Vector8f avec; }; template void check_dynaligned() { - T* obj = new T; - VERIFY(T::NeedsToAlign==1); - VERIFY(size_t(obj)%ALIGNMENT==0); - delete obj; + // TODO have to be updated once we support multiple alignment values + if(T::SizeAtCompileTime % ALIGNMENT == 0) + { + T* obj = new T; + VERIFY(T::NeedsToAlign==1); + VERIFY(size_t(obj)%ALIGNMENT==0); + delete obj; + } } void test_dynalloc() @@ -102,6 +108,7 @@ void test_dynalloc() CALL_SUBTEST(check_dynaligned() ); CALL_SUBTEST(check_dynaligned() ); CALL_SUBTEST(check_dynaligned() ); + CALL_SUBTEST(check_dynaligned() ); } // check static allocation, who knows ? 
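For reference, the pattern exercised by MyStruct and MyClassA in the dynalloc test above is the documented way to make a class with a fixed-size vectorizable Eigen member heap-allocatable at the required boundary. A minimal sketch (the struct name is illustrative):
\code
#include <Eigen/Dense>
struct Payload
{
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW   // overloads operator new/delete with aligned allocation
  char tag;
  Eigen::Vector4f v;                // fixed-size vectorizable member imposes the alignment
};
// Payload* p = new Payload;        // now satisfies Eigen's alignment requirement
\endcode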
diff --git a/test/eigen2/CMakeLists.txt b/test/eigen2/CMakeLists.txt index 84931e037..41a02f4ad 100644 --- a/test/eigen2/CMakeLists.txt +++ b/test/eigen2/CMakeLists.txt @@ -5,6 +5,11 @@ add_dependencies(buildtests eigen2_buildtests) add_definitions("-DEIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API") +# Disable unused warnings for this module +# As EIGEN2 support is deprecated, it is not really worth fixing them +ei_add_cxx_compiler_flag("-Wno-unused-local-typedefs") +ei_add_cxx_compiler_flag("-Wno-unused-but-set-variable") + ei_add_test(eigen2_meta) ei_add_test(eigen2_sizeof) ei_add_test(eigen2_dynalloc) diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index 005af81eb..92d33f66a 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -114,12 +114,24 @@ void test_eigensolver_generic() CALL_SUBTEST_2( { MatrixXd A(1,1); - A(0,0) = std::sqrt(-1.); + A(0,0) = std::sqrt(-1.); // is Not-a-Number Eigen::EigenSolver solver(A); - MatrixXd V(1, 1); - V(0,0) = solver.eigenvectors()(0,0).real(); + VERIFY_IS_EQUAL(solver.info(), NumericalIssue); } ); + // regression test for bug 793 +#ifdef EIGEN_TEST_PART_2 + { + MatrixXd a(3,3); + a << 0, 0, 1, + 1, 1, 1, + 1, 1e+200, 1; + Eigen::EigenSolver eig(a); + VERIFY_IS_APPROX(a * eig.pseudoEigenvectors(), eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()); + VERIFY_IS_APPROX(a * eig.eigenvectors(), eig.eigenvectors() * eig.eigenvalues().asDiagonal()); + } +#endif + TEST_SET_BUT_UNUSED_VARIABLE(s) } diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp index befd7d483..ed5928f10 100644 --- a/test/geo_hyperplane.cpp +++ b/test/geo_hyperplane.cpp @@ -129,9 +129,9 @@ template void hyperplane_alignment() typedef Hyperplane Plane3a; typedef Hyperplane Plane3u; - EIGEN_ALIGN16 Scalar array1[4]; - EIGEN_ALIGN16 Scalar array2[4]; - EIGEN_ALIGN16 Scalar array3[4+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[4]; + EIGEN_ALIGN_DEFAULT Scalar array2[4]; + EIGEN_ALIGN_DEFAULT Scalar array3[4+1]; Scalar* array3u = array3+1; Plane3a *p1 = ::new(reinterpret_cast(array1)) Plane3a; @@ -146,7 +146,7 @@ template void hyperplane_alignment() VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs()); #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY - if(internal::packet_traits::Vectorizable) + if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Plane3a)); #endif } diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp index f0462d40a..58d8eb7f5 100644 --- a/test/geo_parametrizedline.cpp +++ b/test/geo_parametrizedline.cpp @@ -66,9 +66,9 @@ template void parametrizedline_alignment() typedef ParametrizedLine Line4a; typedef ParametrizedLine Line4u; - EIGEN_ALIGN16 Scalar array1[8]; - EIGEN_ALIGN16 Scalar array2[8]; - EIGEN_ALIGN16 Scalar array3[8+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[16]; + EIGEN_ALIGN_DEFAULT Scalar array2[16]; + EIGEN_ALIGN_DEFAULT Scalar array3[16+1]; Scalar* array3u = array3+1; Line4a *p1 = ::new(reinterpret_cast(array1)) Line4a; @@ -86,7 +86,7 @@ template void parametrizedline_alignment() VERIFY_IS_APPROX(p1->direction(), p3->direction()); #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY - if(internal::packet_traits::Vectorizable) + if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Line4a)); #endif } diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp index 1694b32c7..de0f2aeda 100644 --- a/test/geo_quaternion.cpp +++ 
b/test/geo_quaternion.cpp @@ -181,9 +181,9 @@ template void mapQuaternion(void){ v1 = Vector3::Random(); Scalar a = internal::random(-Scalar(M_PI), Scalar(M_PI)); - EIGEN_ALIGN16 Scalar array1[4]; - EIGEN_ALIGN16 Scalar array2[4]; - EIGEN_ALIGN16 Scalar array3[4+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[4]; + EIGEN_ALIGN_DEFAULT Scalar array2[4]; + EIGEN_ALIGN_DEFAULT Scalar array3[4+1]; Scalar* array3unaligned = array3+1; MQuaternionA mq1(array1); @@ -232,9 +232,9 @@ template void quaternionAlignment(void){ typedef Quaternion QuaternionA; typedef Quaternion QuaternionUA; - EIGEN_ALIGN16 Scalar array1[4]; - EIGEN_ALIGN16 Scalar array2[4]; - EIGEN_ALIGN16 Scalar array3[4+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[4]; + EIGEN_ALIGN_DEFAULT Scalar array2[4]; + EIGEN_ALIGN_DEFAULT Scalar array3[4+1]; Scalar* arrayunaligned = array3+1; QuaternionA *q1 = ::new(reinterpret_cast(array1)) QuaternionA; @@ -248,7 +248,7 @@ template void quaternionAlignment(void){ VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs()); VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs()); #if defined(EIGEN_VECTORIZE) && EIGEN_ALIGN_STATICALLY - if(internal::packet_traits::Vectorizable) + if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) VERIFY_RAISES_ASSERT((::new(reinterpret_cast(arrayunaligned)) QuaternionA)); #endif } diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index ee3030b5d..7d9080333 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -404,9 +404,9 @@ template void transform_alignment() typedef Transform Projective3a; typedef Transform Projective3u; - EIGEN_ALIGN16 Scalar array1[16]; - EIGEN_ALIGN16 Scalar array2[16]; - EIGEN_ALIGN16 Scalar array3[16+1]; + EIGEN_ALIGN_DEFAULT Scalar array1[16]; + EIGEN_ALIGN_DEFAULT Scalar array2[16]; + EIGEN_ALIGN_DEFAULT Scalar array3[16+1]; Scalar* array3u = array3+1; Projective3a *p1 = ::new(reinterpret_cast(array1)) Projective3a; diff --git a/test/main.h b/test/main.h index dda451a40..a3c157126 100644 --- a/test/main.h +++ b/test/main.h @@ -31,6 +31,11 @@ // B0 is defined in POSIX header termios.h #define B0 FORBIDDEN_IDENTIFIER +// Unit tests calling Eigen's blas library must preserve the default blocking size +// to avoid troubles. +#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS +#define EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS +#endif // shuts down ICC's remark #593: variable "XXX" was set but never used #define TEST_SET_BUT_UNUSED_VARIABLE(X) X = X + 0; diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp index c18e687a5..5eba3ecb3 100644 --- a/test/mapped_matrix.cpp +++ b/test/mapped_matrix.cpp @@ -26,7 +26,7 @@ template void map_class_vector(const VectorType& m) Scalar* array1 = internal::aligned_new(size); Scalar* array2 = internal::aligned_new(size); Scalar* array3 = new Scalar[size+1]; - Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3; + Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3; Scalar array4[EIGEN_TESTMAP_MAX_SIZE]; Map(array1, size) = VectorType::Random(size); @@ -64,7 +64,7 @@ template void map_class_matrix(const MatrixType& m) for(int i = 0; i < size; i++) array2[i] = Scalar(1); Scalar* array3 = new Scalar[size+1]; for(int i = 0; i < size+1; i++) array3[i] = Scalar(1); - Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3; + Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? 
array3+1 : array3; Map(array1, rows, cols) = MatrixType::Ones(rows,cols); Map(array2, rows, cols) = Map(array1, rows, cols); Map(array3unaligned, rows, cols) = Map(array1, rows, cols); @@ -90,7 +90,7 @@ template void map_static_methods(const VectorType& m) Scalar* array1 = internal::aligned_new(size); Scalar* array2 = internal::aligned_new(size); Scalar* array3 = new Scalar[size+1]; - Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3; + Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3; VectorType::MapAligned(array1, size) = VectorType::Random(size); VectorType::Map(array2, size) = VectorType::Map(array1, size); diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index ffedfb1f4..976e21e37 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -100,11 +100,25 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(sd*mcd*md, sd*mcd*md.template cast()); VERIFY_IS_APPROX(scd*md*mcd, scd*md.template cast().eval()*mcd); VERIFY_IS_APPROX(scd*mcd*md, scd*mcd*md.template cast()); - + VERIFY_IS_APPROX(sf*mf*mcf, sf*mf.template cast()*mcf); VERIFY_IS_APPROX(sf*mcf*mf, sf*mcf*mf.template cast()); VERIFY_IS_APPROX(scf*mf*mcf, scf*mf.template cast()*mcf); VERIFY_IS_APPROX(scf*mcf*mf, scf*mcf*mf.template cast()); + + VERIFY_IS_APPROX(sd*md.adjoint()*mcd, (sd*md).template cast().eval().adjoint()*mcd); + VERIFY_IS_APPROX(sd*mcd.adjoint()*md, sd*mcd.adjoint()*md.template cast()); + VERIFY_IS_APPROX(sd*md.adjoint()*mcd.adjoint(), (sd*md).template cast().eval().adjoint()*mcd.adjoint()); + VERIFY_IS_APPROX(sd*mcd.adjoint()*md.adjoint(), sd*mcd.adjoint()*md.template cast().adjoint()); + VERIFY_IS_APPROX(sd*md*mcd.adjoint(), (sd*md).template cast().eval()*mcd.adjoint()); + VERIFY_IS_APPROX(sd*mcd*md.adjoint(), sd*mcd*md.template cast().adjoint()); + + VERIFY_IS_APPROX(sf*mf.adjoint()*mcf, (sf*mf).template cast().eval().adjoint()*mcf); + VERIFY_IS_APPROX(sf*mcf.adjoint()*mf, sf*mcf.adjoint()*mf.template cast()); + VERIFY_IS_APPROX(sf*mf.adjoint()*mcf.adjoint(), (sf*mf).template cast().eval().adjoint()*mcf.adjoint()); + VERIFY_IS_APPROX(sf*mcf.adjoint()*mf.adjoint(), sf*mcf.adjoint()*mf.template cast().adjoint()); + VERIFY_IS_APPROX(sf*mf*mcf.adjoint(), (sf*mf).template cast().eval()*mcf.adjoint()); + VERIFY_IS_APPROX(sf*mcf*mf.adjoint(), sf*mcf*mf.template cast().adjoint()); VERIFY_IS_APPROX(sf*mf*vcf, (sf*mf).template cast().eval()*vcf); VERIFY_IS_APPROX(scf*mf*vcf,(scf*mf.template cast()).eval()*vcf); @@ -129,7 +143,9 @@ template void mixingtypes(int size = SizeAtCompileType) void test_mixingtypes() { - CALL_SUBTEST_1(mixingtypes<3>()); - CALL_SUBTEST_2(mixingtypes<4>()); - CALL_SUBTEST_3(mixingtypes(internal::random(1,EIGEN_TEST_MAX_SIZE))); + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1(mixingtypes<3>()); + CALL_SUBTEST_2(mixingtypes<4>()); + CALL_SUBTEST_3(mixingtypes(internal::random(1,EIGEN_TEST_MAX_SIZE))); + } } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 2c0519c41..a51d31dbd 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -104,11 +104,12 @@ template void packetmath() const int PacketSize = internal::packet_traits::size; typedef typename NumTraits::Real RealScalar; - const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Packet packets[PacketSize*2]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + const int max_size = PacketSize > 4 ? 
PacketSize : 4; + const int size = PacketSize*max_size; + EIGEN_ALIGN_DEFAULT Scalar data1[size]; + EIGEN_ALIGN_DEFAULT Scalar data2[size]; + EIGEN_ALIGN_DEFAULT Packet packets[PacketSize*2]; + EIGEN_ALIGN_DEFAULT Scalar ref[size]; RealScalar refvalue = 0; for (int i=0; i void packetmath() else if (offset==1) internal::palign<1>(packets[0], packets[1]); else if (offset==2) internal::palign<2>(packets[0], packets[1]); else if (offset==3) internal::palign<3>(packets[0], packets[1]); + else if (offset==4) internal::palign<4>(packets[0], packets[1]); + else if (offset==5) internal::palign<5>(packets[0], packets[1]); + else if (offset==6) internal::palign<6>(packets[0], packets[1]); + else if (offset==7) internal::palign<7>(packets[0], packets[1]); internal::pstore(data2, packets[0]); for (int i=0; i void packetmath() VERIFY(areApprox(ref, data2, PacketSize) && "internal::pset1"); } + { + for (int i=0; i(data1, A0, A1, A2, A3); + internal::pstore(data2+0*PacketSize, A0); + internal::pstore(data2+1*PacketSize, A1); + internal::pstore(data2+2*PacketSize, A2); + internal::pstore(data2+3*PacketSize, A3); + VERIFY(areApprox(ref, data2, 4*PacketSize) && "internal::pbroadcast4"); + } + + { + for (int i=0; i(data1, A0, A1); + internal::pstore(data2+0*PacketSize, A0); + internal::pstore(data2+1*PacketSize, A1); + VERIFY(areApprox(ref, data2, 2*PacketSize) && "internal::pbroadcast2"); + } + VERIFY(internal::isApprox(data1[0], internal::pfirst(internal::pload(data1))) && "internal::pfirst"); if(PacketSize>1) @@ -178,11 +205,30 @@ template void packetmath() VERIFY(areApprox(ref, data2, PacketSize) && "ploaddup"); } } + if(PacketSize>2) + { + for(int offset=0;offset<4;++offset) + { + for(int i=0;i(data1+offset)); + VERIFY(areApprox(ref, data2, PacketSize) && "ploadquad"); + } + } ref[0] = 0; for (int i=0; i(data1)), refvalue) && "internal::predux"); + + { + for (int i=0; i<4; ++i) + ref[i] = 0; + for (int i=0; i(data1))); + VERIFY(areApprox(ref, data2, PacketSize>4?PacketSize/2:PacketSize) && "internal::predux4"); + } ref[0] = 1; for (int i=0; i void packetmath() ref[i] = data1[PacketSize-i-1]; internal::pstore(data2, internal::preverse(internal::pload(data1))); VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse"); + + internal::PacketBlock kernel; + for (int i=0; i(data1+i*PacketSize); + } + ptranspose(kernel); + for (int i=0; i void packetmath_real() @@ -212,9 +270,9 @@ template void packetmath_real() const int PacketSize = internal::packet_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar data1[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar data2[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar ref[internal::packet_traits::size*4]; for (int i=0; i void packetmath_notcomplex() typedef typename internal::packet_traits::type Packet; const int PacketSize = internal::packet_traits::size; - EIGEN_ALIGN16 Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN16 Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar data1[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar data2[internal::packet_traits::size*4]; + EIGEN_ALIGN_DEFAULT Scalar ref[internal::packet_traits::size*4]; Array::Map(data1, internal::packet_traits::size*4).setRandom(); @@ -317,10 +375,10 @@ 
template void packetmath_complex() const int PacketSize = internal::packet_traits::size; const int size = PacketSize*4; - EIGEN_ALIGN16 Scalar data1[PacketSize*4]; - EIGEN_ALIGN16 Scalar data2[PacketSize*4]; - EIGEN_ALIGN16 Scalar ref[PacketSize*4]; - EIGEN_ALIGN16 Scalar pval[PacketSize*4]; + EIGEN_ALIGN_DEFAULT Scalar data1[PacketSize*4]; + EIGEN_ALIGN_DEFAULT Scalar data2[PacketSize*4]; + EIGEN_ALIGN_DEFAULT Scalar ref[PacketSize*4]; + EIGEN_ALIGN_DEFAULT Scalar pval[PacketSize*4]; for (int i=0; i void packetmath_complex() internal::pstore(pval,internal::pcplxflip(internal::pload(data1))); VERIFY(areApprox(ref, pval, PacketSize) && "pcplxflip"); } - - +} + +template void packetmath_scatter_gather() { + typedef typename internal::packet_traits::type Packet; + typedef typename NumTraits::Real RealScalar; + const int PacketSize = internal::packet_traits::size; + EIGEN_ALIGN_DEFAULT Scalar data1[PacketSize]; + RealScalar refvalue = 0; + for (int i=0; i()/RealScalar(PacketSize); + } + EIGEN_ALIGN_DEFAULT Scalar buffer[PacketSize*11]; + memset(buffer, 0, 11*sizeof(Packet)); + Packet packet = internal::pload(data1); + internal::pscatter(buffer, packet, 11); + + for (int i = 0; i < PacketSize*11; ++i) { + if ((i%11) == 0) { + VERIFY(isApproxAbs(buffer[i], data1[i/11], refvalue) && "pscatter"); + } else { + VERIFY(isApproxAbs(buffer[i], Scalar(0), refvalue) && "pscatter"); + } + } + + for (int i=0; i()/RealScalar(PacketSize); + } + packet = internal::pgather(buffer, 7); + internal::pstore(data1, packet); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(data1[i], buffer[i*7], refvalue) && "pgather"); + } } void test_packetmath() @@ -349,8 +437,8 @@ void test_packetmath() CALL_SUBTEST_1( packetmath() ); CALL_SUBTEST_2( packetmath() ); CALL_SUBTEST_3( packetmath() ); - CALL_SUBTEST_1( packetmath >() ); - CALL_SUBTEST_2( packetmath >() ); + CALL_SUBTEST_4( packetmath >() ); + CALL_SUBTEST_5( packetmath >() ); CALL_SUBTEST_1( packetmath_notcomplex() ); CALL_SUBTEST_2( packetmath_notcomplex() ); @@ -359,7 +447,13 @@ void test_packetmath() CALL_SUBTEST_1( packetmath_real() ); CALL_SUBTEST_2( packetmath_real() ); - CALL_SUBTEST_1( packetmath_complex >() ); - CALL_SUBTEST_2( packetmath_complex >() ); + CALL_SUBTEST_4( packetmath_complex >() ); + CALL_SUBTEST_5( packetmath_complex >() ); + + CALL_SUBTEST_1( packetmath_scatter_gather() ); + CALL_SUBTEST_2( packetmath_scatter_gather() ); + CALL_SUBTEST_3( packetmath_scatter_gather() ); + CALL_SUBTEST_4( packetmath_scatter_gather >() ); + CALL_SUBTEST_5( packetmath_scatter_gather >() ); } } diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp index 14da0944b..49239e3a5 100644 --- a/test/pastix_support.cpp +++ b/test/pastix_support.cpp @@ -7,6 +7,8 @@ // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
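The pscatter/pgather checks above verify strided packet stores and loads. Conceptually, for a packet of packet_size scalars and a stride, they behave like the following scalar loops; this is only a plain sketch of the semantics being tested, not Eigen's internal implementation:
\code
// Scatter: write packet element i to buffer[i*stride]; gather is the inverse operation.
template<typename Scalar>
void scatter_sketch(Scalar* buffer, const Scalar* packet, int packet_size, int stride)
{
  for(int i = 0; i < packet_size; ++i)
    buffer[i*stride] = packet[i];
}
template<typename Scalar>
void gather_sketch(Scalar* packet, const Scalar* buffer, int packet_size, int stride)
{
  for(int i = 0; i < packet_size; ++i)
    packet[i] = buffer[i*stride];
}
\endcode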
+ +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" #include #include diff --git a/test/ref.cpp b/test/ref.cpp index f639d900b..19e81549c 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -154,59 +154,79 @@ template void check_const_correctness(const PlainObjec VERIFY( !(Ref::Flags & LvalueBit) ); } -EIGEN_DONT_INLINE void call_ref_1(Ref ) { } -EIGEN_DONT_INLINE void call_ref_2(const Ref& ) { } -EIGEN_DONT_INLINE void call_ref_3(Ref > ) { } -EIGEN_DONT_INLINE void call_ref_4(const Ref >& ) { } -EIGEN_DONT_INLINE void call_ref_5(Ref > ) { } -EIGEN_DONT_INLINE void call_ref_6(const Ref >& ) { } +template +EIGEN_DONT_INLINE void call_ref_1(Ref a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_2(const Ref& a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_3(Ref > a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_4(const Ref >& a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_5(Ref > a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_6(const Ref >& a, const B &b) { VERIFY_IS_EQUAL(a,b); } +template +EIGEN_DONT_INLINE void call_ref_7(Ref > a, const B &b) { VERIFY_IS_EQUAL(a,b); } void call_ref() { - VectorXcf ca(10); - VectorXf a(10); + VectorXcf ca = VectorXcf::Random(10); + VectorXf a = VectorXf::Random(10); + RowVectorXf b = RowVectorXf::Random(10); + MatrixXf A = MatrixXf::Random(10,10); + RowVector3f c = RowVector3f::Random(); const VectorXf& ac(a); VectorBlock ab(a,0,3); - MatrixXf A(10,10); const VectorBlock abc(a,0,3); + - VERIFY_EVALUATION_COUNT( call_ref_1(a), 0); - //call_ref_1(ac); // does not compile because ac is const - VERIFY_EVALUATION_COUNT( call_ref_1(ab), 0); - VERIFY_EVALUATION_COUNT( call_ref_1(a.head(4)), 0); - VERIFY_EVALUATION_COUNT( call_ref_1(abc), 0); - VERIFY_EVALUATION_COUNT( call_ref_1(A.col(3)), 0); - // call_ref_1(A.row(3)); // does not compile because innerstride!=1 - VERIFY_EVALUATION_COUNT( call_ref_3(A.row(3)), 0); - VERIFY_EVALUATION_COUNT( call_ref_4(A.row(3)), 0); - //call_ref_1(a+a); // does not compile for obvious reason + VERIFY_EVALUATION_COUNT( call_ref_1(a,a), 0); + VERIFY_EVALUATION_COUNT( call_ref_1(b,b.transpose()), 0); +// call_ref_1(ac); // does not compile because ac is const + VERIFY_EVALUATION_COUNT( call_ref_1(ab,ab), 0); + VERIFY_EVALUATION_COUNT( call_ref_1(a.head(4),a.head(4)), 0); + VERIFY_EVALUATION_COUNT( call_ref_1(abc,abc), 0); + VERIFY_EVALUATION_COUNT( call_ref_1(A.col(3),A.col(3)), 0); +// call_ref_1(A.row(3)); // does not compile because innerstride!=1 + VERIFY_EVALUATION_COUNT( call_ref_3(A.row(3),A.row(3).transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_4(A.row(3),A.row(3).transpose()), 0); +// call_ref_1(a+a); // does not compile for obvious reason - VERIFY_EVALUATION_COUNT( call_ref_2(A*A.col(1)), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_2(ac.head(5)), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(ac), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(a), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(ab), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(a.head(4)), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(a+a), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_2(ca.imag()), 1); // evaluated into a temp + MatrixXf tmp = A*A.col(1); + VERIFY_EVALUATION_COUNT( call_ref_2(A*A.col(1), tmp), 1); // evaluated into a temp + VERIFY_EVALUATION_COUNT( call_ref_2(ac.head(5),ac.head(5)), 0); + VERIFY_EVALUATION_COUNT( 
call_ref_2(ac,ac), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(a,a), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(ab,ab), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(a.head(4),a.head(4)), 0); + tmp = a+a; + VERIFY_EVALUATION_COUNT( call_ref_2(a+a,tmp), 1); // evaluated into a temp + VERIFY_EVALUATION_COUNT( call_ref_2(ca.imag(),ca.imag()), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_4(ac.head(5)), 0); - VERIFY_EVALUATION_COUNT( call_ref_4(a+a), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_4(ca.imag()), 0); + VERIFY_EVALUATION_COUNT( call_ref_4(ac.head(5),ac.head(5)), 0); + tmp = a+a; + VERIFY_EVALUATION_COUNT( call_ref_4(a+a,tmp), 1); // evaluated into a temp + VERIFY_EVALUATION_COUNT( call_ref_4(ca.imag(),ca.imag()), 0); - VERIFY_EVALUATION_COUNT( call_ref_5(a), 0); - VERIFY_EVALUATION_COUNT( call_ref_5(a.head(3)), 0); - VERIFY_EVALUATION_COUNT( call_ref_5(A), 0); - // call_ref_5(A.transpose()); // does not compile - VERIFY_EVALUATION_COUNT( call_ref_5(A.block(1,1,2,2)), 0); + VERIFY_EVALUATION_COUNT( call_ref_5(a,a), 0); + VERIFY_EVALUATION_COUNT( call_ref_5(a.head(3),a.head(3)), 0); + VERIFY_EVALUATION_COUNT( call_ref_5(A,A), 0); +// call_ref_5(A.transpose()); // does not compile + VERIFY_EVALUATION_COUNT( call_ref_5(A.block(1,1,2,2),A.block(1,1,2,2)), 0); + VERIFY_EVALUATION_COUNT( call_ref_5(b,b), 0); // storage order do not match, but this is a degenerate case that should work + VERIFY_EVALUATION_COUNT( call_ref_5(a.row(3),a.row(3)), 0); - VERIFY_EVALUATION_COUNT( call_ref_6(a), 0); - VERIFY_EVALUATION_COUNT( call_ref_6(a.head(3)), 0); - VERIFY_EVALUATION_COUNT( call_ref_6(A.row(3)), 1); // evaluated into a temp thouth it could be avoided by viewing it as a 1xn matrix - VERIFY_EVALUATION_COUNT( call_ref_6(A+A), 1); // evaluated into a temp - VERIFY_EVALUATION_COUNT( call_ref_6(A), 0); - VERIFY_EVALUATION_COUNT( call_ref_6(A.transpose()), 1); // evaluated into a temp because the storage orders do not match - VERIFY_EVALUATION_COUNT( call_ref_6(A.block(1,1,2,2)), 0); + VERIFY_EVALUATION_COUNT( call_ref_6(a,a), 0); + VERIFY_EVALUATION_COUNT( call_ref_6(a.head(3),a.head(3)), 0); + VERIFY_EVALUATION_COUNT( call_ref_6(A.row(3),A.row(3)), 1); // evaluated into a temp thouth it could be avoided by viewing it as a 1xn matrix + tmp = A+A; + VERIFY_EVALUATION_COUNT( call_ref_6(A+A,tmp), 1); // evaluated into a temp + VERIFY_EVALUATION_COUNT( call_ref_6(A,A), 0); + VERIFY_EVALUATION_COUNT( call_ref_6(A.transpose(),A.transpose()), 1); // evaluated into a temp because the storage orders do not match + VERIFY_EVALUATION_COUNT( call_ref_6(A.block(1,1,2,2),A.block(1,1,2,2)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_7(c,c), 0); } void test_ref() diff --git a/test/sizeof.cpp b/test/sizeof.cpp index 7044d2062..7763e51bd 100644 --- a/test/sizeof.cpp +++ b/test/sizeof.cpp @@ -13,9 +13,9 @@ template void verifySizeOf(const MatrixType&) { typedef typename MatrixType::Scalar Scalar; if (MatrixType::RowsAtCompileTime!=Dynamic && MatrixType::ColsAtCompileTime!=Dynamic) - VERIFY(std::ptrdiff_t(sizeof(MatrixType))==std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime)); + VERIFY_IS_EQUAL(std::ptrdiff_t(sizeof(MatrixType)),std::ptrdiff_t(sizeof(Scalar))*std::ptrdiff_t(MatrixType::SizeAtCompileTime)); else - VERIFY(sizeof(MatrixType)==sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index)); + VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(typename MatrixType::Index)); } void test_sizeof() diff --git a/test/spqr_support.cpp 
b/test/spqr_support.cpp index b8980e081..901c42c40 100644 --- a/test/spqr_support.cpp +++ b/test/spqr_support.cpp @@ -5,6 +5,8 @@ // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed + +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse.h" #include diff --git a/test/superlu_support.cpp b/test/superlu_support.cpp index 3b16135bc..98a7bc5c8 100644 --- a/test/superlu_support.cpp +++ b/test/superlu_support.cpp @@ -7,6 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" #include diff --git a/test/umfpack_support.cpp b/test/umfpack_support.cpp index 9eb84c14b..37ab11f0b 100644 --- a/test/umfpack_support.cpp +++ b/test/umfpack_support.cpp @@ -7,6 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" #include diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index 601dbf214..d8815263a 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -9,6 +9,8 @@ #include "main.h" +typedef Matrix Vector8f; + struct TestNew1 { MatrixXd m; // good: m will allocate its own array, taking care of alignment. @@ -69,7 +71,7 @@ void construct_at_boundary(int boundary) { char buf[sizeof(T)+256]; size_t _buf = reinterpret_cast(buf); - _buf += (16 - (_buf % 16)); // make 16-byte aligned + _buf += (EIGEN_ALIGN_BYTES - (_buf % EIGEN_ALIGN_BYTES)); // make 16/32-byte aligned _buf += boundary; // make exact boundary-aligned T *x = ::new(reinterpret_cast(_buf)) T; x[0].setZero(); // just in order to silence warnings @@ -85,18 +87,18 @@ void unalignedassert() construct_at_boundary(16); construct_at_boundary(16); construct_at_boundary(4); - construct_at_boundary(16); + construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(16); + construct_at_boundary(EIGEN_ALIGN_BYTES); + construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(4); - construct_at_boundary(16); + construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); construct_at_boundary(4); - construct_at_boundary(16); + construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); #endif @@ -110,14 +112,21 @@ void unalignedassert() check_unalignedassert_good >(); #if EIGEN_ALIGN_STATICALLY - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + if(EIGEN_ALIGN_BYTES==16) + { + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + } + for(int b=8; b(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + } #endif } diff --git a/test/unalignedcount.cpp b/test/unalignedcount.cpp index 
ca7e159f3..d6ffeafdf 100644 --- a/test/unalignedcount.cpp +++ b/test/unalignedcount.cpp @@ -30,7 +30,14 @@ static int nb_storeu; void test_unalignedcount() { - #ifdef EIGEN_VECTORIZE_SSE + #if defined(EIGEN_VECTORIZE_AVX) + VectorXf a(40), b(40); + VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0); + VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0); + VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 5, 5, 5, 0); + VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) *= 3.5, 5, 0, 5, 0); + VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) /= 3.5, 5, 0, 5, 0); + #elif defined(EIGEN_VECTORIZE_SSE) VectorXf a(40), b(40); VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 20, 0, 10, 0); VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 10, 10, 10, 0); diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 23be0ffba..1fb079c48 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -137,8 +137,8 @@ template::Vectori typedef Matrix Matrix22; typedef Matrix Matrix44; typedef Matrix Matrix44u; - typedef Matrix Matrix44c; - typedef Matrix Matrix44r; + typedef Matrix Matrix44c; + typedef Matrix Matrix44r; typedef Matrix::Vectori LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), LinearTraversal,CompleteUnrolling)); - + VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), LinearVectorizedTraversal,CompleteUnrolling)); - + VERIFY(test_assign(Matrix(),Matrix()+Matrix(), LinearTraversal,NoUnrolling)); - - VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(10,4), - DefaultTraversal,CompleteUnrolling)); + + VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), + DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); } VERIFY(test_redux(Matrix3(), @@ -221,18 +221,23 @@ template::Vectori VERIFY(test_redux(Matrix44r().template block<1,2*PacketSize>(2,1), LinearVectorizedTraversal,CompleteUnrolling)); - + VERIFY((test_assign< Map >, Matrix22 >(InnerVectorizedTraversal,CompleteUnrolling))); VERIFY((test_assign< - Map >, - Matrix22 + Map, Aligned, InnerStride<3*PacketSize> >, + Matrix >(DefaultTraversal,CompleteUnrolling))); +<<<<<<< local VERIFY((test_assign(Matrix11(), Matrix11().lazyProduct(Matrix11()), InnerVectorizedTraversal, CompleteUnrolling))); +======= + VERIFY((test_assign(Matrix11(), Matrix()*Matrix(), + PacketSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD?DefaultTraversal:InnerVectorizedTraversal, CompleteUnrolling))); +>>>>>>> other #endif VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3), diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index f2c5129b3..049ce5596 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSOR_MODULE #define EIGEN_CXX11_TENSOR_MODULE -#include +#include #include diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry index 027c6087f..f1dc25fea 100644 --- a/unsupported/Eigen/CXX11/TensorSymmetry +++ b/unsupported/Eigen/CXX11/TensorSymmetry @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE #define EIGEN_CXX11_TENSORSYMMETRY_MODULE -#include +#include #include diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index d6b5d75d9..0e274b801 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ 
b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -42,14 +42,14 @@ struct numeric_list { constexpr static std::size_t count = sizeof.. * typename gen_numeric_list_repeated::type numeric_list */ -template struct gen_numeric_list : gen_numeric_list {}; -template struct gen_numeric_list { typedef numeric_list type; }; +template struct gen_numeric_list : gen_numeric_list {}; +template struct gen_numeric_list { typedef numeric_list type; }; -template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; -template struct gen_numeric_list_reversed { typedef numeric_list type; }; +template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; +template struct gen_numeric_list_reversed { typedef numeric_list type; }; -template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; -template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; +template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; +template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; template struct gen_numeric_list_repeated { typedef numeric_list type; }; @@ -285,7 +285,7 @@ struct equal_op { template constexpr static inli struct not_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } }; struct lesser_op { template constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } }; struct lesser_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; -struct greater_op { template constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } }; +struct greater_op { template constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } }; struct greater_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; /* generic unary operations */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 356ae10cf..d71a67590 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -48,13 +48,15 @@ namespace internal { * - libstdc++ from version 4.7 onwards has it nevertheless, * so use that * - libstdc++ older versions: use _M_instance directly - * - libc++ all versions so far: use __elems_ directly + * - libc++ from version 3.4 onwards has it IF compiled with + * -std=c++1y + * - libc++ older versions or -std=c++11: use __elems_ directly * - all other libs: use std::get to be portable, but * this may not be constexpr */ #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 #define STD_GET_ARR_HACK a._M_instance[I] -#elif defined(_LIBCPP_VERSION) +#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_STD_VER) || _LIBCPP_STD_VER <= 11) #define STD_GET_ARR_HACK a.__elems_[I] #else #define STD_GET_ARR_HACK std::template get(a) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index c6216e14c..70ca1433f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -91,9 +91,6 @@ struct tensor_index_linearization_helper return std_array_get(indices); } }; - -/* Forward-declaration required for the symmetry support. 
*/ -template class tensor_symmetry_value_setter; } // end namespace internal template @@ -285,18 +282,6 @@ class Tensor #endif } - template - internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, Index firstIndex, IndexTypes... otherIndices) - { - return symCoeff(symmetry, std::array{{firstIndex, otherIndices...}}); - } - - template - internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, std::array const& indices) - { - return internal::tensor_symmetry_value_setter(*this, symmetry, indices); - } - protected: bool checkIndexRange(const std::array& indices) const { diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h index b5738b778..bc4f2025f 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -15,7 +15,7 @@ namespace Eigen { class DynamicSGroup { public: - inline explicit DynamicSGroup(std::size_t numIndices) : m_numIndices(numIndices), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); } + inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); } inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { } inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); } inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } @@ -33,7 +33,7 @@ class DynamicSGroup template inline RV apply(const std::array& idx, RV initial, Args&&... args) const { - eigen_assert(N == m_numIndices); + eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); for (std::size_t i = 0; i < size(); i++) initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list::type()), m_elements[i].flags, initial, std::forward(args)...); return initial; @@ -42,7 +42,7 @@ class DynamicSGroup template inline RV apply(const std::vector& idx, RV initial, Args&&... args) const { - eigen_assert(idx.size() == m_numIndices); + eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); for (std::size_t i = 0; i < size(); i++) initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward(args)...); return initial; @@ -50,6 +50,19 @@ class DynamicSGroup inline int globalFlags() const { return m_globalFlags; } inline std::size_t size() const { return m_elements.size(); } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, std::array const& indices) const + { + return internal::tensor_symmetry_value_setter(tensor, *this, indices); + } private: struct GroupElement { std::vector representation; @@ -77,7 +90,7 @@ class DynamicSGroup template inline std::array h_permute(std::size_t which, const std::array& idx, internal::numeric_list) const { - return std::array{{ idx[m_elements[which].representation[n]]... }}; + return std::array{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }}; } template @@ -87,6 +100,8 @@ class DynamicSGroup result.reserve(idx.size()); for (auto k : m_elements[which].representation) result.push_back(idx[k]); + for (std::size_t i = m_numIndices; i < idx.size(); i++) + result.push_back(idx[i]); return result; } @@ -135,18 +150,18 @@ class DynamicSGroup }; // dynamic symmetry group that auto-adds the template parameters in the constructor -template +template class DynamicSGroupFromTemplateArgs : public DynamicSGroup { public: - inline DynamicSGroupFromTemplateArgs() : DynamicSGroup(NumIndices) + inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() { add_all(internal::type_list()); } inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { } inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { } - inline DynamicSGroupFromTemplateArgs& operator=(const DynamicSGroupFromTemplateArgs& o) { DynamicSGroup::operator=(o); return *this; } - inline DynamicSGroupFromTemplateArgs& operator=(DynamicSGroupFromTemplateArgs&& o) { DynamicSGroup::operator=(o); return *this; } + inline DynamicSGroupFromTemplateArgs& operator=(const DynamicSGroupFromTemplateArgs& o) { DynamicSGroup::operator=(o); return *this; } + inline DynamicSGroupFromTemplateArgs& operator=(DynamicSGroupFromTemplateArgs&& o) { DynamicSGroup::operator=(o); return *this; } private: template @@ -168,18 +183,32 @@ inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElem GroupElement result; result.representation.reserve(m_numIndices); - for (std::size_t i = 0; i < m_numIndices; i++) - result.representation.push_back(g2.representation[g1.representation[i]]); + for (std::size_t i = 0; i < m_numIndices; i++) { + int v = g2.representation[g1.representation[i]]; + eigen_assert(v >= 0); + result.representation.push_back(v); + } result.flags = g1.flags ^ g2.flags; return result; } inline void DynamicSGroup::add(int one, int two, int flags) { - eigen_assert(one >= 0 && (std::size_t)one < m_numIndices); - eigen_assert(two >= 0 && (std::size_t)two < m_numIndices); + eigen_assert(one >= 0); + eigen_assert(two >= 0); eigen_assert(one != two); - Generator g{one, two ,flags}; + + if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { + std::size_t newNumIndices = (one > two) ? 
one : two + 1; + for (auto& gelem : m_elements) { + gelem.representation.reserve(newNumIndices); + for (std::size_t i = m_numIndices; i < newNumIndices; i++) + gelem.representation.push_back(i); + } + m_numIndices = newNumIndices; + } + + Generator g{one, two, flags}; GroupElement e = ge(g); /* special case for first generator */ diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h index c5a630105..942293bd7 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -114,20 +114,24 @@ struct tensor_static_symgroup_equality template struct tensor_static_symgroup { - typedef StaticSGroup type; + typedef StaticSGroup type; constexpr static std::size_t size = type::static_size; }; -template -constexpr static inline std::array tensor_static_symgroup_index_permute(std::array idx, internal::numeric_list) +template +constexpr static inline std::array tensor_static_symgroup_index_permute(std::array idx, internal::numeric_list, internal::numeric_list) { - return {{ idx[ii]... }}; + return {{ idx[ii]..., idx[jj]... }}; } template static inline std::vector tensor_static_symgroup_index_permute(std::vector idx, internal::numeric_list) { - return {{ idx[ii]... }}; + std::vector result{{ idx[ii]... }}; + std::size_t target_size = idx.size(); + for (std::size_t i = result.size(); i < target_size; i++) + result.push_back(idx[i]); + return result; } template struct tensor_static_symgroup_do_apply; @@ -135,32 +139,35 @@ template struct tensor_static_symgroup_do_apply; template struct tensor_static_symgroup_do_apply> { - template - static inline RV run(const std::array& idx, RV initial, Args&&... args) + template + static inline RV run(const std::array& idx, RV initial, Args&&... args) { - initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward(args)...); - return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); + static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices."); + typedef typename internal::gen_numeric_list::type remaining_indices; + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); } - template + template static inline RV run(const std::vector& idx, RV initial, Args&&... args) { + eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward(args)...); - return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); + return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); } }; template struct tensor_static_symgroup_do_apply> { - template - static inline RV run(const std::array&, RV initial, Args&&...) + template + static inline RV run(const std::array&, RV initial, Args&&...) { // do nothing return initial; } - template + template static inline RV run(const std::vector&, RV initial, Args&&...) 
{ // do nothing @@ -170,9 +177,10 @@ struct tensor_static_symgroup_do_apply +template class StaticSGroup { + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; typedef internal::group_theory::enumerate_group_elements< internal::tensor_static_symgroup_multiply, internal::tensor_static_symgroup_equality, @@ -182,20 +190,20 @@ class StaticSGroup typedef typename group_elements::type ge; public: constexpr inline StaticSGroup() {} - constexpr inline StaticSGroup(const StaticSGroup&) {} - constexpr inline StaticSGroup(StaticSGroup&&) {} + constexpr inline StaticSGroup(const StaticSGroup&) {} + constexpr inline StaticSGroup(StaticSGroup&&) {} - template - static inline RV apply(const std::array& idx, RV initial, Args&&... args) + template + static inline RV apply(const std::array& idx, RV initial, Args&&... args) { - return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); } template static inline RV apply(const std::vector& idx, RV initial, Args&&... args) { eigen_assert(idx.size() == NumIndices); - return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); } constexpr static std::size_t static_size = ge::count; @@ -204,6 +212,19 @@ class StaticSGroup return ge::count; } constexpr static inline int globalFlags() { return group_elements::global_flags; } + + template + inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, std::array const& indices) const + { + return internal::tensor_symmetry_value_setter>(tensor, *this, indices); + } }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h index f0813086a..879d6cd77 100644 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h +++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h @@ -30,6 +30,7 @@ template struct tenso template struct tensor_static_symgroup_if; template struct tensor_symmetry_calculate_flags; template struct tensor_symmetry_assign_value; +template struct tensor_symmetry_num_indices; } // end namespace internal @@ -94,7 +95,7 @@ class DynamicSGroup; * This class is a child class of DynamicSGroup. It uses the template arguments * specified to initialize itself. */ -template +template class DynamicSGroupFromTemplateArgs; /** \class StaticSGroup @@ -116,7 +117,7 @@ class DynamicSGroupFromTemplateArgs; * group becomes too large. (In that case, unrolling may not even be * beneficial.) 
*/ -template +template class StaticSGroup; /** \class SGroup @@ -131,24 +132,50 @@ class StaticSGroup; * \sa StaticSGroup * \sa DynamicSGroup */ -template -class SGroup : public internal::tensor_symmetry_pre_analysis::root_type +template +class SGroup : public internal::tensor_symmetry_pre_analysis::value, Gen...>::root_type { public: + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; typedef typename internal::tensor_symmetry_pre_analysis::root_type Base; // make standard constructors + assignment operators public inline SGroup() : Base() { } - inline SGroup(const SGroup& other) : Base(other) { } - inline SGroup(SGroup&& other) : Base(other) { } - inline SGroup& operator=(const SGroup& other) { Base::operator=(other); return *this; } - inline SGroup& operator=(SGroup&& other) { Base::operator=(other); return *this; } + inline SGroup(const SGroup& other) : Base(other) { } + inline SGroup(SGroup&& other) : Base(other) { } + inline SGroup& operator=(const SGroup& other) { Base::operator=(other); return *this; } + inline SGroup& operator=(SGroup&& other) { Base::operator=(other); return *this; } // all else is defined in the base class }; namespace internal { +template struct tensor_symmetry_num_indices +{ + constexpr static std::size_t value = 1; +}; + +template struct tensor_symmetry_num_indices, Sym...> +{ +private: + constexpr static std::size_t One = static_cast(One_); + constexpr static std::size_t Two = static_cast(Two_); + constexpr static std::size_t Three = tensor_symmetry_num_indices::value; + + // don't use std::max, since it's not constexpr until C++14... + constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1; +public: + constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? 
maxOneTwoPlusOne : Three; +}; + +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; + /** \internal * * \class tensor_symmetry_pre_analysis @@ -199,7 +226,7 @@ namespace internal { template struct tensor_symmetry_pre_analysis { - typedef StaticSGroup root_type; + typedef StaticSGroup<> root_type; }; template @@ -212,7 +239,7 @@ struct tensor_symmetry_pre_analysis typedef typename conditional< possible_size == 0 || possible_size >= max_static_elements, - DynamicSGroupFromTemplateArgs, + DynamicSGroupFromTemplateArgs, typename helper::type >::type root_type; }; @@ -266,7 +293,7 @@ struct tensor_symmetry_calculate_flags } }; -template +template class tensor_symmetry_value_setter { public: diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt index f3180b52b..8eb2808e3 100644 --- a/unsupported/Eigen/src/CMakeLists.txt +++ b/unsupported/Eigen/src/CMakeLists.txt @@ -2,6 +2,7 @@ ADD_SUBDIRECTORY(AutoDiff) ADD_SUBDIRECTORY(BVH) ADD_SUBDIRECTORY(FFT) ADD_SUBDIRECTORY(IterativeSolvers) +ADD_SUBDIRECTORY(LevenbergMarquardt) ADD_SUBDIRECTORY(MatrixFunctions) ADD_SUBDIRECTORY(MoreVectorization) ADD_SUBDIRECTORY(NonLinearOptimization) diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index 0e56342a8..98f9ecc17 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -37,22 +37,31 @@ namespace Eigen { typedef typename Dest::Scalar Scalar; typedef Matrix VectorType; + // Check for zero rhs + const RealScalar rhsNorm2(rhs.squaredNorm()); + if(rhsNorm2 == 0) + { + x.setZero(); + iters = 0; + tol_error = 0; + return; + } + // initialize const int maxIters(iters); // initialize maxIters to iters const int N(mat.cols()); // the size of the matrix - const RealScalar rhsNorm2(rhs.squaredNorm()); const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2) // Initialize preconditioned Lanczos -// VectorType v_old(N); // will be initialized inside loop + VectorType v_old(N); // will be initialized inside loop VectorType v( VectorType::Zero(N) ); //initialize v VectorType v_new(rhs-mat*x); //initialize v_new RealScalar residualNorm2(v_new.squaredNorm()); -// VectorType w(N); // will be initialized inside loop + VectorType w(N); // will be initialized inside loop VectorType w_new(precond.solve(v_new)); // initialize w_new // RealScalar beta; // will be initialized inside loop RealScalar beta_new2(v_new.dot(w_new)); - eigen_assert(beta_new2 >= 0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); + eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); RealScalar beta_new(sqrt(beta_new2)); const RealScalar beta_one(beta_new); v_new /= beta_new; @@ -62,14 +71,14 @@ namespace Eigen { RealScalar c_old(1.0); RealScalar s(0.0); // the sine of the Givens rotation RealScalar s_old(0.0); // the sine of the Givens rotation -// VectorType p_oold(N); // will be initialized in loop + VectorType p_oold(N); // will be initialized in loop VectorType p_old(VectorType::Zero(N)); // initialize p_old=0 VectorType p(p_old); // initialize p=0 RealScalar eta(1.0); iters = 0; // reset iters - while ( iters < maxIters ){ - + while ( iters < maxIters ) + { // 
Preconditioned Lanczos /* Note that there are 4 variants on the Lanczos algorithm. These are * described in Paige, C. C. (1972). Computational variants of @@ -81,17 +90,17 @@ namespace Eigen { * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987). */ const RealScalar beta(beta_new); -// v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter - const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT + v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter +// const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT v = v_new; // update -// w = w_new; // update - const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT + w = w_new; // update +// const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT v_new.noalias() = mat*w - beta*v_old; // compute v_new const RealScalar alpha = v_new.dot(w); v_new -= alpha*v; // overwrite v_new w_new = precond.solve(v_new); // overwrite w_new beta_new2 = v_new.dot(w_new); // compute beta_new - eigen_assert(beta_new2 >= 0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); + eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); beta_new = sqrt(beta_new2); // compute beta_new v_new /= beta_new; // overwrite v_new for next iteration w_new /= beta_new; // overwrite w_new for next iteration @@ -107,28 +116,34 @@ namespace Eigen { s=beta_new/r1; // new sine // Update solution -// p_oold = p_old; - const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT + p_oold = p_old; +// const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT p_old = p; p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED? x += beta_one*c*eta*p; + + /* Update the squared residual. Note that this is the estimated residual. + The real residual |Ax-b|^2 may be slightly larger */ residualNorm2 *= s*s; - if ( residualNorm2 < threshold2){ + if ( residualNorm2 < threshold2) + { break; } eta=-s*eta; // update eta iters++; // increment iteration number (for output purposes) } - tol_error = std::sqrt(residualNorm2 / rhsNorm2); // return error. Note that this is the estimated error. The real error |Ax-b|/|b| may be slightly larger + + /* Compute error. Note that this is the estimated error. 
The real + error |Ax-b|/|b| may be slightly larger */ + tol_error = std::sqrt(residualNorm2 / rhsNorm2); } } template< typename _MatrixType, int _UpLo=Lower, typename _Preconditioner = IdentityPreconditioner> -// typename _Preconditioner = IdentityPreconditioner > // preconditioner must be positive definite class MINRES; namespace internal { diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt index 8513803ce..d9690854d 100644 --- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt +++ b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt @@ -2,5 +2,5 @@ FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h") INSTALL(FILES ${Eigen_LevenbergMarquardt_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/LevenbergMarquardt COMPONENT Devel + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel ) diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h index cd5c04bbf..03198ec8e 100644 --- a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h @@ -41,7 +41,7 @@ class PolynomialSolverBase protected: template< typename OtherPolynomial > inline void setPolynomial( const OtherPolynomial& poly ){ - m_roots.resize(poly.size()); } + m_roots.resize(poly.size()-1); } public: template< typename OtherPolynomial > @@ -316,7 +316,7 @@ class PolynomialSolverBase * - real roots with greatest, smallest absolute real value. * - greatest, smallest real roots. * - * WARNING: this polynomial solver is experimental, part of the unsuported Eigen modules. + * WARNING: this polynomial solver is experimental, part of the unsupported Eigen modules. 
* * * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of @@ -345,10 +345,19 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg> void compute( const OtherPolynomial& poly ) { eigen_assert( Scalar(0) != poly[poly.size()-1] ); - internal::companion companion( poly ); - companion.balance(); - m_eigenSolver.compute( companion.denseMatrix() ); - m_roots = m_eigenSolver.eigenvalues(); + eigen_assert( poly.size() > 1 ); + if(poly.size() > 2 ) + { + internal::companion companion( poly ); + companion.balance(); + m_eigenSolver.compute( companion.denseMatrix() ); + m_roots = m_eigenSolver.eigenvalues(); + } + else if(poly.size () == 2) + { + m_roots.resize(1); + m_roots[0] = -poly[0]/poly[1]; + } } public: @@ -376,10 +385,18 @@ class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1> template< typename OtherPolynomial > void compute( const OtherPolynomial& poly ) { - eigen_assert( Scalar(0) != poly[poly.size()-1] ); - m_roots[0] = -poly[0]/poly[poly.size()-1]; + eigen_assert( poly.size() == 2 ); + eigen_assert( Scalar(0) != poly[1] ); + m_roots[0] = -poly[0]/poly[1]; } + public: + template< typename OtherPolynomial > + inline PolynomialSolver( const OtherPolynomial& poly ){ + compute( poly ); } + + inline PolynomialSolver(){} + protected: using PS_Base::m_roots; }; diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h index 7aafce928..1c40d3f7c 100644 --- a/unsupported/Eigen/src/SparseExtra/MarketIO.h +++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h @@ -238,9 +238,9 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy for(int j=0; j void forward_jacobian(const Func& f) VERIFY_IS_APPROX(j, jref); } + +// TODO also check actual derivatives! void test_autodiff_scalar() { - std::cerr << foo(1,2) << "\n"; + Vector2f p = Vector2f::Random(); typedef AutoDiffScalar AD; - AD ax(1,Vector2f::UnitX()); - AD ay(2,Vector2f::UnitY()); + AD ax(p.x(),Vector2f::UnitX()); + AD ay(p.y(),Vector2f::UnitY()); AD res = foo(ax,ay); - std::cerr << res.value() << " <> " - << res.derivatives().transpose() << "\n\n"; + VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y())); } +// TODO also check actual derivatives! 
void test_autodiff_vector() { - std::cerr << foo(Vector2f(1,2)) << "\n"; + Vector2f p = Vector2f::Random(); typedef AutoDiffScalar AD; typedef Matrix VectorAD; - VectorAD p(AD(1),AD(-1)); - p.x().derivatives() = Vector2f::UnitX(); - p.y().derivatives() = Vector2f::UnitY(); + VectorAD ap = p.cast(); + ap.x().derivatives() = Vector2f::UnitX(); + ap.y().derivatives() = Vector2f::UnitY(); - AD res = foo(p); - std::cerr << res.value() << " <> " - << res.derivatives().transpose() << "\n\n"; + AD res = foo(ap); + VERIFY_IS_APPROX(res.value(), foo(p)); } void test_autodiff_jacobian() { - for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); - CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); - CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); - CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); - CALL_SUBTEST(( forward_jacobian(TestFunc1(3,3)) )); - } + CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); + CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); + CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); + CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); + CALL_SUBTEST(( forward_jacobian(TestFunc1(3,3)) )); } void test_autodiff() { - test_autodiff_scalar(); - test_autodiff_vector(); -// test_autodiff_jacobian(); + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( test_autodiff_scalar() ); + CALL_SUBTEST_2( test_autodiff_vector() ); + CALL_SUBTEST_3( test_autodiff_jacobian() ); + } } diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index a9843e9a9..af5cadbf9 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -91,18 +91,36 @@ static void test_gen_numeric_list() VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); + VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); VERIFY((is_same::type, numeric_list>::value)); diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp index e8dfffd92..d680e9b3b 100644 --- a/unsupported/test/cxx11_tensor_symmetry.cpp +++ b/unsupported/test/cxx11_tensor_symmetry.cpp @@ -32,8 +32,8 @@ using Eigen::GlobalImagFlag; // helper function to determine if the compiler intantiated a static // or dynamic symmetry group -template -bool isDynGroup(StaticSGroup const& dummy) 
+template +bool isDynGroup(StaticSGroup const& dummy) { (void)dummy; return false; @@ -86,7 +86,7 @@ static void test_symgroups_static() std::array identity{{0,1,2,3,4,5,6}}; // Simple static symmetry group - StaticSGroup<7, + StaticSGroup< AntiSymmetry<0,1>, Hermiticity<0,2> > group; @@ -113,7 +113,7 @@ static void test_symgroups_dynamic() identity.push_back(i); // Simple dynamic symmetry group - DynamicSGroup group(7); + DynamicSGroup group; group.add(0,1,NegationFlag); group.add(0,2,ConjugationFlag); @@ -143,7 +143,7 @@ static void test_symgroups_selection() { // Do the same test as in test_symgroups_static but // require selection via SGroup - SGroup<7, + SGroup< AntiSymmetry<0,1>, Hermiticity<0,2> > group; @@ -168,7 +168,7 @@ static void test_symgroups_selection() // simple factorizing group: 5 generators, 2^5 = 32 elements // selection should make this dynamic, although static group // can still be reasonably generated - SGroup<10, + SGroup< Symmetry<0,1>, Symmetry<2,3>, Symmetry<4,5>, @@ -196,7 +196,7 @@ static void test_symgroups_selection() // no verify that we could also generate a static group // with these generators found.clear(); - StaticSGroup<10, + StaticSGroup< Symmetry<0,1>, Symmetry<2,3>, Symmetry<4,5>, @@ -211,7 +211,7 @@ static void test_symgroups_selection() { // try to create a HUGE group - SGroup<7, + SGroup< Symmetry<0,1>, Symmetry<1,2>, Symmetry<2,3>, @@ -657,11 +657,11 @@ static void test_symgroups_selection() static void test_tensor_epsilon() { - SGroup<3, AntiSymmetry<0,1>, AntiSymmetry<1,2>> sym; + SGroup, AntiSymmetry<1,2>> sym; Tensor epsilon(3,3,3); epsilon.setZero(); - epsilon.symCoeff(sym, 0, 1, 2) = 1; + sym(epsilon, 0, 1, 2) = 1; for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { @@ -674,7 +674,7 @@ static void test_tensor_epsilon() static void test_tensor_sym() { - SGroup<4, Symmetry<0,1>, Symmetry<2,3>> sym; + SGroup, Symmetry<2,3>> sym; Tensor t(10,10,10,10); t.setZero(); @@ -683,7 +683,7 @@ static void test_tensor_sym() for (int k = l; k < 10; k++) { for (int j = 0; j < 10; j++) { for (int i = j; i < 10; i++) { - t.symCoeff(sym, i, j, k, l) = (i + j) * (k + l); + sym(t, i, j, k, l) = (i + j) * (k + l); } } } @@ -703,7 +703,7 @@ static void test_tensor_sym() static void test_tensor_asym() { - SGroup<4, AntiSymmetry<0,1>, AntiSymmetry<2,3>> sym; + SGroup, AntiSymmetry<2,3>> sym; Tensor t(10,10,10,10); t.setZero(); @@ -712,7 +712,7 @@ static void test_tensor_asym() for (int k = l + 1; k < 10; k++) { for (int j = 0; j < 10; j++) { for (int i = j + 1; i < 10; i++) { - t.symCoeff(sym, i, j, k, l) = ((i * j) + (k * l)); + sym(t, i, j, k, l) = ((i * j) + (k * l)); } } } @@ -740,7 +740,7 @@ static void test_tensor_asym() static void test_tensor_dynsym() { - DynamicSGroup sym(4); + DynamicSGroup sym; sym.addSymmetry(0,1); sym.addSymmetry(2,3); Tensor t(10,10,10,10); @@ -751,7 +751,7 @@ static void test_tensor_dynsym() for (int k = l; k < 10; k++) { for (int j = 0; j < 10; j++) { for (int i = j; i < 10; i++) { - t.symCoeff(sym, i, j, k, l) = (i + j) * (k + l); + sym(t, i, j, k, l) = (i + j) * (k + l); } } } @@ -770,7 +770,7 @@ static void test_tensor_dynsym() static void test_tensor_randacc() { - SGroup<4, Symmetry<0,1>, Symmetry<2,3>> sym; + SGroup, Symmetry<2,3>> sym; Tensor t(10,10,10,10); t.setZero(); @@ -787,7 +787,7 @@ static void test_tensor_randacc() std::swap(i, j); if (k < l) std::swap(k, l); - t.symCoeff(sym, i, j, k, l) = (i + j) * (k + l); + sym(t, i, j, k, l) = (i + j) * (k + l); } for (int l = 0; l < 10; l++) { diff --git 
a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index fd12da548..81b762c37 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -1,8 +1,8 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2011 Gael Guennebaud // Copyright (C) 2012 Giacomo Po +// Copyright (C) 2011 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -14,19 +14,29 @@ template void test_minres_T() { - MINRES, Lower, DiagonalPreconditioner > minres_colmajor_diag; - MINRES, Lower, IdentityPreconditioner > minres_colmajor_I; -// MINRES, Lower, IncompleteLUT > minres_colmajor_ilut; - //minres, SSORPreconditioner > minres_colmajor_ssor; + // Identity preconditioner + MINRES, Lower, IdentityPreconditioner > minres_colmajor_lower_I; + MINRES, Upper, IdentityPreconditioner > minres_colmajor_upper_I; + + // Diagonal preconditioner + MINRES, Lower, DiagonalPreconditioner > minres_colmajor_lower_diag; + MINRES, Upper, DiagonalPreconditioner > minres_colmajor_upper_diag; + + // call tests for SPD matrix + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_I) ); + + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag) ); + + // TO DO: symmetric semi-definite matrix + // TO DO: symmetric indefinite matrix - CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_diag) ); - CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_I) ); - // CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_ilut) ); - //CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_ssor) ); } void test_minres() { CALL_SUBTEST_1(test_minres_T()); -// CALL_SUBTEST_2(test_minres_T >()); +// CALL_SUBTEST_2(test_minres_T >()); + } diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp index 680ff6d31..0c87478dd 100644 --- a/unsupported/test/polynomialsolver.cpp +++ b/unsupported/test/polynomialsolver.cpp @@ -38,6 +38,9 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) const Index deg = pols.size()-1; + // Test template constructor from coefficient vector + SOLVER solve_constr (pols); + psolve.compute( pols ); const RootsType& roots( psolve.roots() ); EvalRootsType evr( deg ); @@ -210,5 +213,6 @@ void test_polynomialsolver() CALL_SUBTEST_10((polynomialsolver( internal::random(9,13) )) ); + CALL_SUBTEST_11((polynomialsolver(1)) ); } }
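
Note on the ref.cpp changes earlier in the patch: the call_ref_N helpers now take both the Ref argument and a plain reference so the bound values can be compared with VERIFY_IS_EQUAL, while VERIFY_EVALUATION_COUNT still tracks how many temporaries the binding costs. A minimal standalone sketch of the binding rules those tests exercise (the function names here are made up for illustration):

    #include <Eigen/Dense>
    using namespace Eigen;

    // Writable view: binds to any contiguous float vector without copying.
    void scale_in_place(Ref<VectorXf> v) { v *= 2.0f; }

    // Read-only view: can also bind to expressions, at the cost of one temporary.
    float sum_of(const Ref<const VectorXf>& v) { return v.sum(); }

    int main()
    {
      VectorXf a = VectorXf::Random(10);
      MatrixXf A = MatrixXf::Random(10, 10);

      scale_in_place(a);           // binds directly, no copy
      scale_in_place(a.head(4));   // contiguous block, still no copy
      scale_in_place(A.col(3));    // a column of a column-major matrix is contiguous
      // scale_in_place(A.row(3)); // does not compile: inner stride != 1

      float s1 = sum_of(a);        // no temporary
      float s2 = sum_of(a + a);    // expression evaluated into one temporary
      (void)s1; (void)s2;
      return 0;
    }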
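
Note on unalignedassert.cpp: the scratch buffer is now aligned to EIGEN_ALIGN_BYTES rather than a hard-coded 16, so the same test covers the 32-byte alignment that AVX packets require. The test nudges the pointer with _buf += (EIGEN_ALIGN_BYTES - (_buf % EIGEN_ALIGN_BYTES)); a generic round-up helper doing the equivalent job (my own sketch, not the test code) looks like:

    #include <cstdint>
    #include <cstddef>
    #include <cassert>

    // Round p up to the next multiple of align (align must be a power of two).
    inline char* align_up(char* p, std::size_t align)
    {
      std::uintptr_t v = reinterpret_cast<std::uintptr_t>(p);
      v = (v + align - 1) & ~static_cast<std::uintptr_t>(align - 1);
      return reinterpret_cast<char*>(v);
    }

    int main()
    {
      char buf[64 + 32];
      char* p16 = align_up(buf, 16);  // SSE requirement
      char* p32 = align_up(buf, 32);  // AVX requirement
      assert(reinterpret_cast<std::uintptr_t>(p16) % 16 == 0);
      assert(reinterpret_cast<std::uintptr_t>(p32) % 32 == 0);
      return 0;
    }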
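
Note on unalignedcount.cpp: the new AVX branch halves the expected load/store counts because a 256-bit AVX packet holds 8 floats instead of 4. For a += b on 40 floats that is 5 packets per operand, hence 10 aligned loads and 5 stores, versus the SSE figures of 20 and 10. The arithmetic behind those expectations, as a compile-time check (a sketch only, not Eigen code):

    #include <cstddef>

    // Number of full packets needed to traverse n scalars.
    constexpr std::size_t packets(std::size_t n, std::size_t per_packet)
    {
      return n / per_packet;
    }

    static_assert(packets(40, 4) == 10, "SSE: 128-bit packets, 4 floats each");
    static_assert(packets(40, 8) == 5,  "AVX: 256-bit packets, 8 floats each");

    int main() { return 0; }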
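
Note on the TensorSymmetry changes: symmetry groups no longer carry an explicit index count, and coefficients are written through the group's operator() rather than the removed Tensor::symCoeff(). Based on the updated test_tensor_epsilon, usage now looks roughly like this (C++11 required; the include paths for the unsupported module headers are assumed):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <unsupported/Eigen/CXX11/TensorSymmetry>
    using namespace Eigen;

    int main()
    {
      // The required rank is deduced from the generators (indices 0,1,2 here),
      // so no leading index-count template argument is needed anymore.
      SGroup<AntiSymmetry<0,1>, AntiSymmetry<1,2> > sym;

      Tensor<double, 3> epsilon(3, 3, 3);
      epsilon.setZero();

      // One assignment populates every permutation of (0,1,2), with the sign
      // flips implied by the antisymmetries: the Levi-Civita tensor.
      sym(epsilon, 0, 1, 2) = 1;
      return 0;
    }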
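
The new internal::tensor_symmetry_num_indices helper is what makes that deduction possible: each generator over indices One and Two needs max(One,Two)+1 indices, and the group needs the maximum over all its generators. Reduced to its core (a simplified sketch, not the actual recursive trait):

    #include <cstddef>

    constexpr std::size_t indices_needed(std::size_t one, std::size_t two)
    {
      // don't use std::max: it is not constexpr until C++14
      return (one > two ? one : two) + 1;
    }

    static_assert(indices_needed(0, 1) == 2, "Symmetry<0,1> acts on rank >= 2");
    static_assert(indices_needed(1, 2) == 3, "AntiSymmetry<1,2> acts on rank >= 3");

    int main() { return 0; }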
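
Note on MINRES: the patch adds an early return for a zero right-hand side, hoists the per-iteration temporaries (v_old, w, p_oold) out of the loop, and the test now exercises both the Lower and Upper variants with identity and diagonal preconditioners. A minimal solve mirroring what the test instantiates (the matrix values are my own toy example, and the unsupported IterativeSolvers module header is assumed to provide MINRES):

    #include <Eigen/Sparse>
    #include <unsupported/Eigen/IterativeSolvers>
    using namespace Eigen;

    int main()
    {
      typedef SparseMatrix<double> SpMat;

      // Tiny symmetric positive definite system.
      SpMat A(2, 2);
      A.insert(0, 0) = 4.0;
      A.insert(1, 1) = 3.0;
      A.makeCompressed();
      VectorXd b(2);
      b << 1.0, 2.0;

      MINRES<SpMat, Lower, DiagonalPreconditioner<double> > solver;
      solver.compute(A);
      VectorXd x = solver.solve(b);

      // With the new guard, a zero rhs yields x == 0 immediately instead of
      // feeding a zero norm into the tolerance computation.
      VectorXd x0 = solver.solve(VectorXd::Zero(2));
      (void)x; (void)x0;
      return 0;
    }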
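
Note on PolynomialSolver: the root vector is now sized as poly.size()-1 (a polynomial with n+1 coefficients has n roots), and degree-1 input is handled without a companion matrix, returning -poly[0]/poly[1] directly. A small sketch of that degree-1 path, using the coefficient-vector constructor the updated test also exercises:

    #include <unsupported/Eigen/Polynomials>
    #include <Eigen/Dense>
    #include <iostream>
    using namespace Eigen;

    int main()
    {
      // p(x) = 6 - 3x, stored low-degree first: coeffs[k] multiplies x^k.
      Vector2d coeffs(6.0, -3.0);

      // Template constructor from the coefficient vector, as in the new test.
      PolynomialSolver<double, 1> psolve(coeffs);

      // Exactly one root: -coeffs[0]/coeffs[1] == 2.
      std::cout << psolve.roots()[0] << "\n";
      return 0;
    }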
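
Note on the autodiff test: it was changed from printing results to verifying foo's value at a random point (checking the derivatives themselves is still marked TODO in the patch). For reference, the AutoDiffScalar pattern being tested, with a stand-in foo() of my own since the real one is defined elsewhere in the test file:

    #include <unsupported/Eigen/AutoDiff>
    #include <Eigen/Core>
    #include <iostream>
    using namespace Eigen;

    // Stand-in for the test's foo(): any function template works.
    template<typename T> T foo(const T& x, const T& y) { return x*x + y*y; }

    int main()
    {
      typedef AutoDiffScalar<Vector2f> AD;
      Vector2f p = Vector2f::Random();

      // Seed each input with a unit derivative direction.
      AD ax(p.x(), Vector2f::UnitX());
      AD ay(p.y(), Vector2f::UnitY());

      AD res = foo(ax, ay);
      std::cout << "value:    " << res.value() << "\n"            // equals foo(p.x(), p.y())
                << "gradient: " << res.derivatives().transpose()  // equals (2*p.x(), 2*p.y())
                << "\n";
      return 0;
    }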