merge with default branch

Gael Guennebaud 2014-06-20 15:55:44 +02:00
commit b29b81a1f4
158 changed files with 4721 additions and 1751 deletions


@@ -196,6 +196,18 @@ if(NOT MSVC)
    message(STATUS "Enabling SSE4.2 in tests/examples")
  endif()
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
+  if(EIGEN_TEST_FMA)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+    message(STATUS "Enabling FMA in tests/examples")
+  endif()
  option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
  if(EIGEN_TEST_ALTIVEC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
@@ -437,12 +449,12 @@ set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} )
set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
-configure_file ( ${CMAKE_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
                 ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
                 @ONLY ESCAPE_QUOTES
               )
-install ( FILES ${CMAKE_SOURCE_DIR}/cmake/UseEigen3.cmake
+install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
                ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
          DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
        )


@@ -10,9 +10,11 @@
  *
  *
  * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
- * Those decompositions are accessible via the following MatrixBase methods:
- *  - MatrixBase::llt(),
+ * Those decompositions are also accessible via the following methods:
+ *  - MatrixBase::llt()
  *  - MatrixBase::ldlt()
+ *  - SelfAdjointView::llt()
+ *  - SelfAdjointView::ldlt()
  *
  * \code
  * #include <Eigen/Cholesky>
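A usage sketch of the new SelfAdjointView entry points (the name solve_spd is illustrative, not part of the commit):

#include <Eigen/Dense>
using namespace Eigen;

// A is assumed symmetric positive definite; only its lower triangle is read.
VectorXd solve_spd(const MatrixXd& A, const VectorXd& b) {
  return A.selfadjointView<Lower>().llt().solve(b);  // SelfAdjointView::llt(); use .ldlt() for semidefinite A
}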


@@ -117,7 +117,16 @@
  #ifdef __SSE4_2__
    #define EIGEN_VECTORIZE_SSE4_2
  #endif
+  #ifdef __AVX__
+    #define EIGEN_VECTORIZE_AVX
+    #define EIGEN_VECTORIZE_SSE3
+    #define EIGEN_VECTORIZE_SSSE3
+    #define EIGEN_VECTORIZE_SSE4_1
+    #define EIGEN_VECTORIZE_SSE4_2
+  #endif
+  #ifdef __FMA__
+    #define EIGEN_VECTORIZE_FMA
+  #endif

  // include files

  // This extern "C" works around a MINGW-w64 compilation issue
@@ -130,7 +139,7 @@
  extern "C" {
    // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
    // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-   #ifdef __INTEL_COMPILER
+   #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
      #include <immintrin.h>
    #else
      #include <emmintrin.h>
@@ -147,6 +156,9 @@
    #ifdef EIGEN_VECTORIZE_SSE4_2
    #include <nmmintrin.h>
    #endif
+   #ifdef EIGEN_VECTORIZE_AVX
+   #include <immintrin.h>
+   #endif
  #endif
  } // end extern "C"
#elif defined __ALTIVEC__
@@ -216,7 +228,9 @@
namespace Eigen {

inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_SSE4_2)
+#if defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
@@ -294,7 +308,13 @@ using std::ptrdiff_t;
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"

-#if defined EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_AVX
+  // Use AVX for floats and doubles, SSE for integers
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/Complex.h"
+#elif defined EIGEN_VECTORIZE_SSE
  #include "src/Core/arch/SSE/PacketMath.h"
  #include "src/Core/arch/SSE/MathFunctions.h"
  #include "src/Core/arch/SSE/Complex.h"


@@ -47,7 +47,9 @@
#include "src/Geometry/AlignedBox.h"
#include "src/Geometry/Umeyama.h"

-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
#include "src/Geometry/arch/Geometry_SSE.h"
#endif

#endif


@@ -27,7 +27,9 @@
#include "src/LU/Determinant.h"
#include "src/LU/InverseImpl.h"

-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
  #include "src/LU/arch/Inverse_SSE.h"
#endif


@@ -43,7 +43,7 @@ namespace internal {
  * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
  * decomposition to determine whether a system of equations has a solution.
  *
-  * \sa MatrixBase::ldlt(), class LLT
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
  */
template<typename _MatrixType, int _UpLo> class LDLT
{
@@ -179,7 +179,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
  * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
  * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
  *
-  * \sa MatrixBase::ldlt()
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
  */
#ifdef EIGEN_TEST_EVALUATORS
template<typename Rhs>
@@ -610,6 +610,7 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
#ifndef __CUDACC__
/** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa MatrixBase::ldlt()
  */
template<typename MatrixType, unsigned int UpLo>
inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -620,6 +621,7 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const
/** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa SelfAdjointView::ldlt()
  */
template<typename Derived>
inline const LDLT<typename MatrixBase<Derived>::PlainObject>


@@ -41,7 +41,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  * Example: \include LLT_example.cpp
  * Output: \verbinclude LLT_example.out
  *
-  * \sa MatrixBase::llt(), class LDLT
+  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
  */
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
@@ -115,7 +115,7 @@ template<typename _MatrixType, int _UpLo> class LLT
  * Example: \include LLT_solve.cpp
  * Output: \verbinclude LLT_solve.out
  *
-  * \sa solveInPlace(), MatrixBase::llt()
+  * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
  */
#ifdef EIGEN_TEST_EVALUATORS
template<typename Rhs>
@@ -498,6 +498,7 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
#ifndef __CUDACC__
/** \cholesky_module
  * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
  */
template<typename Derived>
inline const LLT<typename MatrixBase<Derived>::PlainObject>
@@ -508,6 +509,7 @@ MatrixBase<Derived>::llt() const
/** \cholesky_module
  * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
  */
template<typename MatrixType, unsigned int UpLo>
inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>


@@ -105,6 +105,8 @@ public:
    EIGEN_DEBUG_VAR(DstIsAligned)
    EIGEN_DEBUG_VAR(SrcIsAligned)
    EIGEN_DEBUG_VAR(JointAlignment)
+   EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime)
+   EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost)
    EIGEN_DEBUG_VAR(InnerSize)
    EIGEN_DEBUG_VAR(InnerMaxSize)
    EIGEN_DEBUG_VAR(PacketSize)


@@ -202,6 +202,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan)
//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp)
EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln)


@@ -88,8 +88,8 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
  MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
                      && (InnerStrideAtCompileTime == 1)
                      ? PacketAccessBit : 0,
-  MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+  MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
-  FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+  FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
  FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
  FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
  Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |


@@ -40,7 +40,7 @@ void check_static_allocation_size()
  */
template <typename T, int Size, int MatrixOrArrayOptions,
          int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%16)==0) ? 16
+                        : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
                        : 0 >
struct plain_array
{
@@ -68,27 +68,27 @@ struct plain_array
template<typename PtrType>
EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-  eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
+  eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
#else
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-  eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
+  eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
              && "this assertion is explained here: " \
              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
              " **** READ THIS WEB PAGE !!! ****");
#endif

template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
{
-  EIGEN_USER_ALIGN16 T array[Size];
+  EIGEN_USER_ALIGN_DEFAULT T array[Size];
  EIGEN_DEVICE_FUNC
  plain_array()
  {
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
    check_static_allocation_size<T,Size>();
  }
@@ -102,7 +102,7 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 16>
template <typename T, int MatrixOrArrayOptions, int Alignment>
struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
{
-  EIGEN_USER_ALIGN16 T array[1];
+  EIGEN_USER_ALIGN_DEFAULT T array[1];
  EIGEN_DEVICE_FUNC plain_array() {}
  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
};


@@ -69,8 +69,7 @@ template<typename Lhs, typename Rhs> struct product_type
  MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
                                         _Rhs::MaxRowsAtCompileTime),
  Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
-                                      _Rhs::RowsAtCompileTime),
-  LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                                      _Rhs::RowsAtCompileTime)
};

// the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -424,7 +423,7 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
  EIGEN_STRONG_INLINE Scalar* data() {
    return ForceAlignment
-          ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+          ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
          : m_data.array;
  }
#endif
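The pointer rounding above is the usual align-up idiom; a standalone sketch under the assumption that the alignment is a power of two (align_up is an illustrative name):

#include <cstddef>

// Rounds p up past the next Align boundary. Like the code above, it always
// adds Align, which is why m_data is oversized by one packet when
// ForceAlignment is set.
template <std::size_t Align>
inline void* align_up(void* p) {
  return reinterpret_cast<void*>((reinterpret_cast<std::size_t>(p) & ~(Align - 1)) + Align);
}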

Eigen/src/Core/GenericPacketMath.h Normal file → Executable file

@@ -42,6 +42,8 @@ namespace internal {
struct default_packet_traits
{
  enum {
+    HasHalfPacket = 0,
    HasAdd    = 1,
    HasSub    = 1,
    HasMul    = 1,
@@ -71,10 +73,12 @@ struct default_packet_traits
template<typename T> struct packet_traits : default_packet_traits
{
  typedef T type;
+  typedef T half;
  enum {
    Vectorizable = 0,
    size = 1,
-    AlignedOnScalar = 0
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
  };
  enum {
    HasAdd = 0,
@@ -157,17 +161,65 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }

+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
+
/** \internal \returns a packet with elements of \a *from duplicated.
-  * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
-  * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
+  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
+  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
  * Currently, this function is only used for scalar * complex products.
  */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }

-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+/** \internal \returns a packet with elements of \a *from quadrupled.
+  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
+  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
+  * Currently, this function is only used in matrix products.
+  * For packet-size smaller or equal to 4, this function is equivalent to pload1
+  */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+ploadquad(const typename unpacket_traits<Packet>::type* from)
+{ return pload1<Packet>(from); }
/** \internal equivalent to
* \code
* a0 = pload1(a+0);
* a1 = pload1(a+1);
* a2 = pload1(a+2);
* a3 = pload1(a+3);
* \endcode
* \sa pset1, pload1, ploaddup, pbroadcast2
*/
template<typename Packet> EIGEN_DEVICE_FUNC
inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
Packet& a0, Packet& a1, Packet& a2, Packet& a3)
{
a0 = pload1<Packet>(a+0);
a1 = pload1<Packet>(a+1);
a2 = pload1<Packet>(a+2);
a3 = pload1<Packet>(a+3);
}
/** \internal equivalent to
* \code
* a0 = pload1(a+0);
* a1 = pload1(a+1);
* \endcode
* \sa pset1, pload1, ploaddup, pbroadcast4
*/
template<typename Packet> EIGEN_DEVICE_FUNC
inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
Packet& a0, Packet& a1)
{
a0 = pload1<Packet>(a+0);
a1 = pload1<Packet>(a+1);
}
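Spelled out on plain arrays for an 8-lane float packet, the broadcast/load primitives above behave as follows (reference semantics only; the _ref names are illustrative, not Eigen code):

#include <array>

std::array<float,8> set1_ref(float a)            { return {{a,a,a,a,a,a,a,a}}; }  // pset1
std::array<float,8> load1_ref(const float* p)    { return set1_ref(p[0]); }       // pload1
std::array<float,8> loaddup_ref(const float* p)  { return {{p[0],p[0],p[1],p[1],p[2],p[2],p[3],p[3]}}; }  // ploaddup
std::array<float,8> loadquad_ref(const float* p) { return {{p[0],p[0],p[0],p[0],p[1],p[1],p[1],p[1]}}; }  // ploadquad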
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Scalar> inline typename packet_traits<Scalar>::type
@@ -181,6 +233,12 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
{ (*to) = from; }

+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, int /*stride*/)
+{ return ploadu<Packet>(from); }
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/)
+{ pstore(to, from); }
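The generic pgather/pscatter above can ignore the stride because a generic packet holds a single scalar; for a packet of `size` elements the intended semantics are (reference loops, illustrative names):

// pgather reads `size` scalars spaced `stride` apart; pscatter is the inverse.
void gather_ref(float* packet, const float* from, int stride, int size) {
  for (int i = 0; i < size; ++i) packet[i] = from[i*stride];
}
void scatter_ref(float* to, const float* packet, int stride, int size) {
  for (int i = 0; i < size; ++i) to[i*stride] = packet[i];
}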
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> inline void prefetch(const Scalar* addr)
{
@@ -201,6 +259,15 @@ preduxp(const Packet* vecs) { return vecs[0]; }
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
{ return a; }

+/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller or equal to 4, this boils down to a noop.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux4(const Packet& a)
+{ return a; }

/** \internal \returns the product of the elements of \a a*/
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
{ return a; }
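predux4's contract in scalar form, for the 8-lane case described in its comment (predux4_ref is an illustrative name):

#include <array>

// {a0..a7} -> {a0+a4, a1+a5, a2+a6, a3+a7}; for packet sizes <= 4 it is a no-op.
std::array<float,4> predux4_ref(const std::array<float,8>& a) {
  return {{ a[0]+a[4], a[1]+a[5], a[2]+a[6], a[3]+a[7] }};
}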
@@ -251,6 +318,10 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pacos(const Packet& a) { using std::acos; return acos(a); }

+/** \internal \returns the atan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { using std::atan; return atan(a); }

/** \internal \returns the exp of \a a (coeff-wise) */
template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -348,9 +419,22 @@ template<> inline std::complex<double> pmul(const std::complex<double>& a, const
#endif

+/***************************************************************************
+ * PacketBlock, that is a collection of N packets where the number of words
+ * in the packet is a multiple of N.
+***************************************************************************/
+template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
+  Packet packet[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERIC_PACKET_MATH_H


@@ -45,6 +45,7 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
+ EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
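This declaration makes the free function Eigen::atan available on array expressions; a hedged usage sketch:

#include <Eigen/Dense>

// Coefficient-wise arc tangent through the newly declared global function.
Eigen::ArrayXf angles(const Eigen::ArrayXf& t) { return Eigen::atan(t); }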


@@ -89,7 +89,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
                && ( bool(IsDynamicSize)
                  || HasNoOuterStride
                  || ( OuterStrideAtCompileTime!=Dynamic
-                  && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+                  && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
    Flags0 = TraitsBase::Flags & (~NestByRefBit),
    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))


@@ -166,11 +166,10 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
                                        internal::inner_stride_at_compile_time<Derived>::ret==1),
                          PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0) && "data is not aligned");
+      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
#else
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % 16) == 0) && "data is not aligned");
+      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
#endif
    }

    PointerType m_data;
@@ -255,6 +254,8 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
    using Base::Base::operator=;
};

+#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
+
} // end namespace Eigen

#endif // EIGEN_MAPBASE_H


@@ -669,6 +669,15 @@ bool (isfinite)(const T& x)
  return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
}

+template<typename T>
+EIGEN_DEVICE_FUNC
+bool (isfinite)(const std::complex<T>& x)
+{
+  using std::real;
+  using std::imag;
+  return isfinite(real(x)) && isfinite(imag(x));
+}
+
} // end namespace numext
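A minimal check of the new overload (assuming it is reachable as Eigen::numext::isfinite after including <Eigen/Core>; the extra parentheses dodge a possible isfinite macro from <cmath>):

#include <cassert>
#include <complex>
#include <limits>
#include <Eigen/Core>

void isfinite_complex_demo() {
  const double inf = std::numeric_limits<double>::infinity();
  assert(  (Eigen::numext::isfinite)(std::complex<double>(1.0, 2.0)) );  // both parts finite
  assert( !(Eigen::numext::isfinite)(std::complex<double>(inf, 0.0)) );  // infinite real part
}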
namespace internal {


@@ -325,11 +325,17 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
  };

  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
  {
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    if (VectorizedSize > 0) {
      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
      if (VectorizedSize != Size)
        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
      return res;
    }
+    else {
+      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+    }
+  }
};
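The new branch matters for fixed sizes smaller than one packet: with AVX a float packet holds 8 lanes, so a fully unrolled reduction of, say, a Vector2f presumably has VectorizedSize == 0 and must fall back to the scalar unroller. Hedged illustration:

#include <Eigen/Dense>

// With 8-lane float packets, Size = 2 gives VectorizedSize = 0, taking the
// new scalar-only path above.
float tiny_sum(const Eigen::Vector2f& v) { return v.sum(); }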
#ifdef EIGEN_ENABLE_EVALUATORS #ifdef EIGEN_ENABLE_EVALUATORS


@@ -97,7 +97,7 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
    template<typename Derived> struct match {
      enum {
        HasDirectAccess = internal::has_direct_access<Derived>::ret,
-        StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+        StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
        InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
                        || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
                        || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
@@ -168,6 +168,10 @@ protected:
      }
      else
        ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
+      if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
+        ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
+      else
        ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
                                     StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());
    }


@@ -0,0 +1,6 @@
FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
INSTALL(FILES
${Eigen_Core_arch_AVX_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
)


@@ -0,0 +1,463 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_AVX_H
#define EIGEN_COMPLEX_AVX_H
namespace Eigen {
namespace internal {
//---------- float ----------
struct Packet4cf
{
EIGEN_STRONG_INLINE Packet4cf() {}
EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
__m256 v;
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet4cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
HasHalfPacket = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
{
return Packet4cf(pnegate(a.v));
}
template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
{
const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
return Packet4cf(_mm256_xor_ps(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
__m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
__m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
__m256 result = _mm256_addsub_ps(tmp1, tmp2);
return Packet4cf(result);
}
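For reference, the lane arithmetic implemented above with moveldup/movehdup and addsub, written out on one complex pair (cmul_ref is an illustrative name, not Eigen code):

#include <complex>

// tmp1 lanes hold ar*br, ar*bi; tmp2 lanes hold ai*bi, ai*br (b swapped);
// addsub subtracts in even lanes and adds in odd lanes:
std::complex<float> cmul_ref(std::complex<float> a, std::complex<float> b) {
  float re = a.real()*b.real() - a.imag()*b.imag();
  float im = a.real()*b.imag() + a.imag()*b.real();
  return std::complex<float>(re, im);
}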
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
{
return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
}
template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
{
// FIXME The following might be optimized using _mm256_movedup_pd
Packet2cf a = ploaddup<Packet2cf>(from);
Packet2cf b = ploaddup<Packet2cf>(from+1);
return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
}
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride)
{
return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
std::imag(from[2*stride]), std::real(from[2*stride]),
std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride])));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride)
{
__m128 low = _mm256_extractf128_ps(from.v, 0);
to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
_mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
__m128 high = _mm256_extractf128_ps(from.v, 1);
to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
_mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
}
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
{
return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
}
template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
__m128 low = _mm256_extractf128_ps(a.v, 0);
__m128 high = _mm256_extractf128_ps(a.v, 1);
__m128d lowd = _mm_castps_pd(low);
__m128d highd = _mm_castps_pd(high);
low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
__m256 result = _mm256_setzero_ps();
result = _mm256_insertf128_ps(result, low, 1);
result = _mm256_insertf128_ps(result, high, 0);
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
{
return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
Packet2cf(_mm256_extractf128_ps(a.v,1))));
}
template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
{
Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
t0 = _mm256_hadd_ps(t0,t1);
Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
t2 = _mm256_hadd_ps(t2,t3);
t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
return Packet4cf(_mm256_add_ps(t1,t3));
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
{
return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
Packet2cf(_mm256_extractf128_ps(a.v, 1))));
}
template<int Offset>
struct palign_impl<Offset,Packet4cf>
{
static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
{
if (Offset==0) return;
palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet8f, Packet4cf, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
Packet4cf num = pmul(a, pconj(b));
__m256 tmp = _mm256_mul_ps(b.v, b.v);
__m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1);
__m256 denom = _mm256_add_ps(tmp, tmp2);
return Packet4cf(_mm256_div_ps(num.v, denom));
}
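The division above is the textbook a/b = a*conj(b) / |b|^2, with |b|^2 broadcast to both lanes of each complex number (cdiv_ref is an illustrative sketch):

#include <complex>

std::complex<float> cdiv_ref(std::complex<float> a, std::complex<float> b) {
  float n2 = b.real()*b.real() + b.imag()*b.imag();  // the "denom" lanes above
  std::complex<float> num = a * std::conj(b);        // pmul(a, pconj(b))
  return std::complex<float>(num.real()/n2, num.imag()/n2);
}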
template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
{
return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
}
//---------- double ----------
struct Packet2cd
{
EIGEN_STRONG_INLINE Packet2cd() {}
EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
__m256d v;
};
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet2cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 2,
HasHalfPacket = 1,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
{
const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
return Packet2cd(_mm256_xor_pd(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
__m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
__m256d even = _mm256_mul_pd(tmp1, b.v);
__m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
__m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
__m256d odd = _mm256_mul_pd(tmp2, tmp3);
return Packet2cd(_mm256_addsub_pd(even, odd));
}
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
{
// in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
}
template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, int stride)
{
return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride])));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, int stride)
{
__m128d low = _mm256_extractf128_pd(from.v, 0);
to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
__m128d high = _mm256_extractf128_pd(from.v, 1);
to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
}
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
{
__m128d low = _mm256_extractf128_pd(a.v, 0);
EIGEN_ALIGN16 double res[2];
_mm_store_pd(res, low);
return std::complex<double>(res[0],res[1]);
}
template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
__m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
return Packet2cd(result);
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
{
return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
Packet1cd(_mm256_extractf128_pd(a.v,1))));
}
template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
{
Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
return Packet2cd(_mm256_add_pd(t0,t1));
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
{
return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
Packet1cd(_mm256_extractf128_pd(a.v,1))));
}
template<int Offset>
struct palign_impl<Offset,Packet2cd>
{
static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
{
if (Offset==0) return;
palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet4d, Packet2cd, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
Packet2cd num = pmul(a, pconj(b));
__m256d tmp = _mm256_mul_pd(b.v, b.v);
__m256d denom = _mm256_hadd_pd(tmp, tmp);
return Packet2cd(_mm256_div_pd(num.v, denom));
}
template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
{
return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4cf,4>& kernel) {
__m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
__m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
__m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
__m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
__m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
__m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
__m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
__m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cd,2>& kernel) {
__m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
kernel.packet[0].v = tmp;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_AVX_H


@@ -0,0 +1,564 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H
namespace Eigen {
namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif
#endif
typedef __m256 Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;
template<> struct is_arithmetic<__m256> { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };
#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
const Packet8f p8f_##NAME = pset1<Packet8f>(X)
#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
const Packet4d p4d_##NAME = pset1<Packet4d>(X)
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet8f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8,
HasHalfPacket = 1,
HasDiv = 1,
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
};
};
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet4d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket = 1,
HasDiv = 1,
HasExp = 0
};
};
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet8i type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8
};
};
*/
template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8}; };
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{ eigen_assert(false && "packet integer division are not supported by AVX");
return pset1<Packet8i>(0);
}
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if defined(__clang__) || defined(__GNUC__)
// clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
// and gcc stupidly generates a vfmadd132ps instruction,
// so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
// the result of the product.
Packet8f res = c;
asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
#else
return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if defined(__clang__) || defined(__GNUC__)
// see above
Packet4d res = c;
asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
#else
return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
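pmadd is an elementwise fused multiply-add; the inline asm above only pins the accumulator c to the destination register (vfmadd231) so that loops accumulate in place. Scalar semantics (pmadd_ref is illustrative):

// res = a*b + c, rounded once when real FMA hardware executes it.
inline float pmadd_ref(float a, float b, float c) { return a*b + c; }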
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
// TODO try to find a way to avoid the need of a temporary register
// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
// return _mm256_unpacklo_ps(tmp,tmp);
// _mm256_insertf128_ps is very slow on Haswell, thus:
Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
// mimic an "inplace" permutation of the lower 128bits using a blend
tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
// then we can perform a consistent permutation on the global register to get everything in shape:
return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
return _mm256_permute_pd(tmp, 3<<2);
}
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
{
return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride)
{
return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
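// Editor's note -- an illustrative sketch, not part of this patch: the AVX2
// gather mentioned in the NOTE above could look like this; _mm256_mullo_epi32
// and _mm256_i32gather_ps both require AVX2 (kept inside #if 0).
#if 0
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
{
  __m256i indices = _mm256_mullo_epi32(_mm256_set1_epi32(stride),
                                       _mm256_setr_epi32(0,1,2,3,4,5,6,7)); // {0,stride,...,7*stride}
  return _mm256_i32gather_ps(from, indices, 4); // scale = sizeof(float)
}
#endif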
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, int stride)
{
__m128 low = _mm256_extractf128_ps(from, 0);
to[stride*0] = _mm_cvtss_f32(low);
to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
__m128 high = _mm256_extractf128_ps(from, 1);
to[stride*4] = _mm_cvtss_f32(high);
to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, int stride)
{
__m128d low = _mm256_extractf128_pd(from, 0);
to[stride*0] = _mm_cvtsd_f64(low);
to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
__m128d high = _mm256_extractf128_pd(from, 1);
to[stride*2] = _mm_cvtsd_f64(high);
to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
Packet8f pa = pset1<Packet8f>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
Packet4d pa = pset1<Packet4d>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
Packet8i pa = pset1<Packet8i>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
__m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
__m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
// Alternative implementation, kept for reference (it previously sat after the
// return statement as unreachable dead code):
//  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
//  return _mm256_permute_pd(swap_halves,5);
}
// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
return _mm256_and_pd(a,mask);
}
// preduxp should be ok
// FIXME: why is this ok? why isn't the simple implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
__m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
__m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
__m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
__m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
__m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
__m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
__m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
__m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
__m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
__m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
__m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
__m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
__m256 sum1 = _mm256_add_ps(perm1, hsum5);
__m256 sum2 = _mm256_add_ps(perm2, hsum6);
__m256 sum3 = _mm256_add_ps(perm3, hsum7);
__m256 sum4 = _mm256_add_ps(perm4, hsum8);
__m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
__m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
__m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
return final;
}
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
Packet4d tmp0, tmp1;
tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
return _mm256_blend_pd(tmp0, tmp1, 0xC);
}
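// Editor's note: preduxp(vecs)[i] is the horizontal sum of vecs[i]; the hadd
// chains above interleave these per-vector reductions (eight for Packet8f,
// four for Packet4d) and reassemble the results with cross-lane permutes and blends.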
template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
tmp0 = _mm256_hadd_ps(tmp0,tmp0);
return pfirst(_mm256_hadd_ps(tmp0, tmp0));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_hadd_pd(tmp0,tmp0));
}
template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
Packet8f tmp;
tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
Packet4d tmp;
tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template<int Offset>
struct palign_impl<Offset,Packet8f>
{
static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
{
if (Offset==1)
{
first = _mm256_blend_ps(first, second, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
}
else if (Offset==2)
{
first = _mm256_blend_ps(first, second, 3);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
}
else if (Offset==3)
{
first = _mm256_blend_ps(first, second, 7);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
}
else if (Offset==4)
{
first = _mm256_blend_ps(first, second, 15);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
}
else if (Offset==5)
{
first = _mm256_blend_ps(first, second, 31);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0x88);
}
else if (Offset==6)
{
first = _mm256_blend_ps(first, second, 63);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0xcc);
}
else if (Offset==7)
{
first = _mm256_blend_ps(first, second, 127);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0xee);
}
}
};
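// Editor's note, illustrative only: palign_impl<Offset,Packet8f>::run shifts the
// concatenation of (first,second) left by Offset floats, e.g. for Offset==3:
//   first = { f3,f4,f5,f6,f7, s0,s1,s2 }
// AVX1 has no single lane-crossing shift instruction, hence the blend/permute chains above.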
template<int Offset>
struct palign_impl<Offset,Packet4d>
{
static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
{
if (Offset==1)
{
first = _mm256_blend_pd(first, second, 1);
__m256d tmp = _mm256_permute_pd(first, 5);
first = _mm256_permute2f128_pd(tmp, tmp, 1);
first = _mm256_blend_pd(tmp, first, 0xA);
}
else if (Offset==2)
{
first = _mm256_blend_pd(first, second, 3);
first = _mm256_permute2f128_pd(first, first, 1);
}
else if (Offset==3)
{
first = _mm256_blend_pd(first, second, 7);
__m256d tmp = _mm256_permute_pd(first, 5);
first = _mm256_permute2f128_pd(tmp, tmp, 1);
first = _mm256_blend_pd(tmp, first, 5);
}
}
};
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
__m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
__m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
__m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
__m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
__m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
__m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
__m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
__m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
__m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
__m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
__m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
__m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
__m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
__m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
__m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
__m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
__m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
__m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
__m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
__m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
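// Editor's note -- an illustrative usage sketch, not part of this patch
// (assumes row-major 4x4 arrays m and mT; kept inside #if 0):
#if 0
void transpose4x4(const double* m, double* mT)
{
  PacketBlock<Packet4d,4> blk;
  for (int r = 0; r < 4; ++r) blk.packet[r] = ploadu<Packet4d>(m + 4*r); // load the four rows
  ptranspose(blk);                                                       // blk.packet[c] now holds column c of m
  for (int r = 0; r < 4; ++r) pstoreu(mT + 4*r, blk.packet[r]);          // store the transposed rows
}
#endif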
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_AVX_H
@ -21,6 +21,8 @@ static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);
static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
//---------- float ----------
struct Packet2cf
@ -33,6 +35,7 @@ struct Packet2cf
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@ -51,7 +54,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@ -65,6 +68,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
{
std::complex<float> EIGEN_ALIGN16 af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return Packet2cf(vec_ld(0, (const float*)af));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
{
std::complex<float> EIGEN_ALIGN16 af[2];
vec_st(from.v, 0, (float*)af);
to[0*stride] = af[0];
to[1*stride] = af[1];
}
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
@ -210,6 +229,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x
return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1);
kernel.packet[0].v = tmp;
}
} // end namespace internal
} // end namespace Eigen
@ -18,6 +18,10 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif
#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#endif
@ -73,6 +77,7 @@ static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@ -89,6 +94,7 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half;
enum {
// FIXME check the Has*
Vectorizable = 1,
@ -97,8 +103,8 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
/*
inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
@ -144,6 +150,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
return s;
}
*/
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
float EIGEN_ALIGN16 af[4];
@ -161,6 +168,65 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
return vc;
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
a3 = vec_ld(0,a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
a2 = vec_splat(a3, 2);
a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
a3 = vec_ld(0,a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
a2 = vec_splat(a3, 2);
a3 = vec_splat(a3, 3);
}
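// Editor's note, illustrative only: pbroadcast4(a, a0,a1,a2,a3) behaves like
//   a0 = pset1<Packet4f>(a[0]); ... a3 = pset1<Packet4f>(a[3]);  (float variant)
// but pays for a single aligned vec_ld followed by four cheap vec_splat.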
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
{
float EIGEN_ALIGN16 af[4];
af[0] = from[0*stride];
af[1] = from[1*stride];
af[2] = from[2*stride];
af[3] = from[3*stride];
return vec_ld(0, af);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
{
int EIGEN_ALIGN16 ai[4];
ai[0] = from[0*stride];
ai[1] = from[1*stride];
ai[2] = from[2*stride];
ai[3] = from[3*stride];
return vec_ld(0, ai);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
{
float EIGEN_ALIGN16 af[4];
vec_st(from, 0, af);
to[0*stride] = af[0];
to[1*stride] = af[1];
to[2*stride] = af[2];
to[3*stride] = af[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
{
int EIGEN_ALIGN16 ai[4];
vec_st(from, 0, ai);
to[0*stride] = ai[0];
to[1*stride] = ai[1];
to[2*stride] = ai[2];
to[3*stride] = ai[3];
}
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
@ -494,6 +560,32 @@ struct palign_impl<Offset,Packet4i>
}
};
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
Packet4f t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
kernel.packet[0] = vec_mergeh(t0, t2);
kernel.packet[1] = vec_mergel(t0, t2);
kernel.packet[2] = vec_mergeh(t1, t3);
kernel.packet[3] = vec_mergel(t1, t3);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
Packet4i t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
kernel.packet[0] = vec_mergeh(t0, t2);
kernel.packet[1] = vec_mergel(t0, t2);
kernel.packet[2] = vec_mergeh(t1, t3);
kernel.packet[3] = vec_mergel(t1, t3);
}
} // end namespace internal
} // end namespace Eigen
@ -1,4 +1,5 @@
ADD_SUBDIRECTORY(SSE)
ADD_SUBDIRECTORY(AltiVec)
ADD_SUBDIRECTORY(NEON)
ADD_SUBDIRECTORY(AVX)
ADD_SUBDIRECTORY(Default)
@ -28,6 +28,7 @@ struct Packet2cf
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@ -46,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@ -110,6 +111,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
{
Packet4f res;
res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
return Packet2cf(res);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
{
to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
@ -246,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cf,2>& kernel) {
float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
kernel.packet[1].v = tmp;
}
} // end namespace internal
} // end namespace Eigen
@ -66,6 +66,7 @@ typedef uint32x4_t Packet4ui;
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@ -83,6 +84,7 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
@ -95,12 +97,13 @@ template<> struct packet_traits<int> : default_packet_traits
// workaround gcc 4.2, 4.3 and 4.4 compilation issue
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
@ -219,6 +222,40 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
{
Packet4f res;
res = vsetq_lane_f32(from[0*stride], res, 0);
res = vsetq_lane_f32(from[1*stride], res, 1);
res = vsetq_lane_f32(from[2*stride], res, 2);
res = vsetq_lane_f32(from[3*stride], res, 3);
return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
{
Packet4i res;
res = vsetq_lane_s32(from[0*stride], res, 0);
res = vsetq_lane_s32(from[1*stride], res, 1);
res = vsetq_lane_s32(from[2*stride], res, 2);
res = vsetq_lane_s32(from[3*stride], res, 3);
return res;
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
{
to[stride*0] = vgetq_lane_f32(from, 0);
to[stride*1] = vgetq_lane_f32(from, 1);
to[stride*2] = vgetq_lane_f32(from, 2);
to[stride*3] = vgetq_lane_f32(from, 3);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
{
to[stride*0] = vgetq_lane_s32(from, 0);
to[stride*1] = vgetq_lane_s32(from, 1);
to[stride*2] = vgetq_lane_s32(from, 2);
to[stride*3] = vgetq_lane_s32(from, 3);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); }
@ -385,6 +422,7 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
max = vpmax_s32(a_lo, a_hi);
max = vpmax_s32(max, max);
return vget_lane_s32(max, 0);
}
@ -413,6 +451,27 @@ PALIGN_NEON(3,Packet4i,vextq_s32)
#undef PALIGN_NEON
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
}
} // end namespace internal
} // end namespace Eigen
@ -22,13 +22,18 @@ struct Packet2cf
__m128 v;
};
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
@ -42,8 +47,9 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasSetLinear = 0
};
};
#endif
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
{
return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride])));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
{
to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
}
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
#endif
}
template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
{
@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@ -248,13 +269,18 @@ struct Packet1cd
__m128d v;
};
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet1cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 1,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
HasSetLinear = 0
};
};
#endif
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
{
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
@ -311,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
// FIXME force unaligned store, this is a temporary fix
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
@ -410,7 +437,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
@ -419,7 +446,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
@ -432,7 +459,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
{
return Packet1cd(preverse(Packet2d(x.v)));
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cf,2>& kernel) {
__m128d w1 = _mm_castps_pd(kernel.packet[0].v);
__m128d w2 = _mm_castps_pd(kernel.packet[1].v);
__m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
kernel.packet[1].v = tmp;
}
} // end namespace internal
@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
x = _mm_or_ps(x, p4f_half);
emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
/* part2:
if( x < SQRTHF ) {
@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
} else { x = x - 1.0; }
*/
Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
Packet4f tmp = pand(x, mask);
x = psub(x, p4f_1);
e = psub(e, pand(p4f_1, mask));
x = padd(x, tmp);
Packet4f x2 = pmul(x,x);
@ -167,7 +167,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, p4i_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
return pmul(y, Packet4f(_mm_castsi128_ps(emm0)));
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet2d pexp<Packet2d>(const Packet2d& _x)
@ -241,7 +241,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
emm0 = _mm_add_epi32(emm0, p4i_1023_0);
emm0 = _mm_slli_epi32(emm0, 20);
emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
return pmul(x, Packet2d(_mm_castsi128_pd(emm0)));
}
/* evaluation of 4 sines at once, using SSE2 intrinsics.
Eigen/src/Core/arch/SSE/PacketMath.h Normal file → Executable file
@ -22,9 +22,41 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif
#endif
#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER)
// With GCC's default ABI version, __m128 and __m256 are the same type, and therefore we cannot
// have overloads for both types without a linking error.
// One solution is to increase the ABI version using -fabi-version=4 (or greater).
// To work around this inconvenience, we instead wrap the 128-bit types in the following helper
// structure:
// TODO: disable this wrapper when abi-version>=4, but how to detect that without asking the user to define a macro?
template<typename T>
struct eigen_packet_wrapper
{
EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
m_val = v;
return *this;
}
T m_val;
};
typedef eigen_packet_wrapper<__m128> Packet4f;
typedef eigen_packet_wrapper<__m128i> Packet4i;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
typedef __m128 Packet4f;
typedef __m128i Packet4i;
typedef __m128d Packet2d;
#endif
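// Editor's note -- illustrative only, not part of this patch: the implicit
// conversions above make the wrapper transparent to the intrinsics, e.g.:
#if 0
Packet4f p = pset1<Packet4f>(1.f); // wrapper constructed from the __m128 returned by the intrinsic
__m128 raw = p;                    // operator T&() hands back the raw register
p = _mm_add_ps(p, p);              // both arguments convert implicitly to __m128
#endif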
template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
@ -58,13 +90,18 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
typedef Packet4f half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket = 0,
HasDiv = 1,
HasSin = EIGEN_FAST_MATH,
@ -77,19 +114,23 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet2d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=2,
HasHalfPacket = 0,
HasDiv = 1,
HasExp = 1,
HasSqrt = 1
};
};
#endif
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
typedef Packet4i half;
enum {
// FIXME check the Has*
Vectorizable = 1,
@ -98,9 +139,9 @@ template<> struct packet_traits<int> : default_packet_traits
};
};
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
#if defined(_MSC_VER) && (_MSC_VER==1500)
// Workaround MSVC 9 internal compiler error.
@ -110,13 +151,26 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203)
// Using inline assembly is also not an option because then gcc fails to reorder the instructions properly.
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
}
#endif
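// Editor's note, illustrative only: pload1 broadcasts a single scalar, i.e.
//   Packet4f p = pload1<Packet4f>(ptr); // p = {*ptr, *ptr, *ptr, *ptr}
// semantically the same as pset1<Packet4f>(*ptr), but steering gcc towards
// the cheaper movss+pshufd sequence.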
#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
#endif
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
@ -139,7 +193,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
}
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
@ -173,6 +227,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
// for some weird reason, it has to be overloaded for packets of integers // for some weird reason, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif
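For illustration, a sketch of what the FMA overloads buy (assumes -mfma and the internal namespace): the multiply and add are fused into one instruction with a single rounding.

    Packet4f a = pset1<Packet4f>(1.5f), b = pset1<Packet4f>(2.0f), c = pset1<Packet4f>(0.25f);
    Packet4f r = pmadd(a, b, c);  // each lane: fma(1.5f, 2.0f, 0.25f) == 3.25f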
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
@ -218,7 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
#if defined(_MSC_VER) #if defined(_MSC_VER)
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
@ -236,7 +294,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E
#endif #endif
} }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
#else #else
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require pointer casting to incompatible pointer types and lead to invalid code // require pointer casting to incompatible pointer types and lead to invalid code
@ -245,14 +303,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { E
// TODO: do the same for MSVC (ICC is compatible) // TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes! // NOTE: with the code below, MSVC's compiler crashes!
#if defined(__GNUC__) && defined(__i386__) #if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8)))
// bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
#elif defined(__clang__) #elif defined(__clang__)
// bug 201: Segfaults in __mm_loadh_pd with clang 2.8 // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
#define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
#else #else
#define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0 #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
#define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
#endif #endif
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
@ -283,7 +344,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{ {
EIGEN_DEBUG_UNALIGNED_LOAD EIGEN_DEBUG_UNALIGNED_LOAD
#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
#else #else
__m128d res; __m128d res;
res = _mm_load_sd((const double*)(from)) ; res = _mm_load_sd((const double*)(from)) ;
@ -302,38 +363,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{ {
Packet4i tmp; Packet4i tmp;
tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from)); tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
return vec4i_swizzle1(tmp, 0, 0, 1, 1); return vec4i_swizzle1(tmp, 0, 0, 1, 1);
} }
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
EIGEN_DEBUG_UNALIGNED_STORE EIGEN_DEBUG_UNALIGNED_STORE
#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES
_mm_storeu_pd(to, from);
#else
_mm_storel_pd((to), from); _mm_storel_pd((to), from);
_mm_storeh_pd((to+1), from); _mm_storeh_pd((to+1), from);
#endif
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); }
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
{
return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
{
return _mm_set_pd(from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
{
return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
{
to[stride*0] = _mm_cvtss_f32(from);
to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride)
{
to[stride*0] = _mm_cvtsd_f64(from);
to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
{
to[stride*0] = _mm_cvtsi128_si32(from);
to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
} }
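A small sketch of the new strided primitives (buffer contents and stride are made-up example values; internal namespace assumed):

    float in[12]  = {0,1,2,3,4,5,6,7,8,9,10,11};
    float out[12] = {};
    Packet4f p = pgather<float, Packet4f>(in, 3);   // gathers in[0], in[3], in[6], in[9]
    pscatter<float, Packet4f>(out, p, 3);           // writes out[0], out[3], out[6], out[9]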
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
// some compilers might be tempted to perform multiple moves instead of using a vector path. // some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{ {
Packet4f pa = _mm_set_ss(a); Packet4f pa = _mm_set_ss(a);
pstore(to, vec4f_swizzle1(pa,0,0,0,0)); pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
} }
// some compilers might be tempted to perform multiple moves instead of using a vector path. // some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{ {
Packet2d pa = _mm_set_sd(a); Packet2d pa = _mm_set_sd(a);
pstore(to, vec2d_swizzle1(pa,0,0)); pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
} }
#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#endif
#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
@ -380,6 +480,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
#endif #endif
} }
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
a3 = pload<Packet4f>(a);
a0 = vec4f_swizzle1(a3, 0,0,0,0);
a1 = vec4f_swizzle1(a3, 1,1,1,1);
a2 = vec4f_swizzle1(a3, 2,2,2,2);
a3 = vec4f_swizzle1(a3, 3,3,3,3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
#ifdef EIGEN_VECTORIZE_SSE3
a0 = _mm_loaddup_pd(a+0);
a1 = _mm_loaddup_pd(a+1);
a2 = _mm_loaddup_pd(a+2);
a3 = _mm_loaddup_pd(a+3);
#else
a1 = pload<Packet2d>(a);
a0 = vec2d_swizzle1(a1, 0,0);
a1 = vec2d_swizzle1(a1, 1,1);
a3 = pload<Packet2d>(a+2);
a2 = vec2d_swizzle1(a3, 0,0);
a3 = vec2d_swizzle1(a3, 1,1);
#endif
}
#endif
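In words, pbroadcast4 reads four consecutive scalars and replicates each one across its own packet; a sketch for the SSE-only path guarded above:

    double coeffs[4] = {1.0, 2.0, 3.0, 4.0};
    Packet2d b0, b1, b2, b3;
    pbroadcast4<Packet2d>(coeffs, b0, b1, b2, b3);
    // b0 == {1,1}, b1 == {2,2}, b2 == {3,3}, b3 == {4,4}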
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
{ {
vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
@ -407,10 +539,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{ {
Packet4f tmp0 = _mm_hadd_ps(a,a); Packet4f tmp0 = _mm_hadd_ps(a,a);
return pfirst(_mm_hadd_ps(tmp0, tmp0)); return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
} }
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
// SSSE3 version: // SSSE3 version:
// EIGEN_STRONG_INLINE float predux(const Packet4i& a) // EIGEN_STRONG_INLINE float predux(const Packet4i& a)
@ -453,7 +585,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{ {
Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
} }
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@ -476,11 +608,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{ {
Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a)); Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
} }
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{ {
return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
} }
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{ {
@ -496,17 +628,17 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{ {
Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a)); Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
} }
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{ {
return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
} }
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{ {
#ifdef EIGEN_VECTORIZE_SSE4_1 #ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else #else
// after some experiments, it seems this is the fastest way to implement it // after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore !!) // for GCC (e.g., it does not like using std::min after the pstore !!)
@ -522,17 +654,17 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{ {
Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a)); Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
} }
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{ {
return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
} }
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{ {
#ifdef EIGEN_VECTORIZE_SSE4_1 #ifdef EIGEN_VECTORIZE_SSE4_1
Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else #else
// after some experiments, it seems this is the fastest way to implement it // after some experiments, it seems this is the fastest way to implement it
// for GCC (e.g., it does not like using std::min after the pstore !!) // for GCC (e.g., it does not like using std::min after the pstore !!)
@ -652,6 +784,31 @@ struct palign_impl<Offset,Packet2d>
}; };
#endif #endif
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
__m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
kernel.packet[1] = tmp;
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
__m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
__m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
__m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
__m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
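Usage sketch for the new in-register transpose (hypothetical row-major 4x4 input; internal namespace assumed):

    float m[16] = { 0, 1, 2, 3,   4, 5, 6, 7,   8, 9,10,11,   12,13,14,15 };
    PacketBlock<Packet4f,4> block;
    for (int i = 0; i < 4; ++i)
      block.packet[i] = ploadu<Packet4f>(m + 4*i);  // row i
    ptranspose(block);  // block.packet[i] now holds column i: {i, i+4, i+8, i+12}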
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen
View File
@ -320,6 +320,26 @@ struct functor_traits<scalar_asin_op<Scalar> >
}; };
}; };
/** \internal
* \brief Template functor to compute the atan of a scalar
* \sa class CwiseUnaryOp, ArrayBase::atan()
*/
template<typename Scalar> struct scalar_atan_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); }
typedef typename packet_traits<Scalar>::type Packet;
inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
};
template<typename Scalar>
struct functor_traits<scalar_atan_op<Scalar> >
{
enum {
Cost = 5 * NumTraits<Scalar>::MulCost,
PacketAccess = packet_traits<Scalar>::HasATan
};
};
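With the functor and its traits in place, the array-level entry point mentioned in the \sa line becomes available; a hedged user-side example (assumes <Eigen/Dense>):

    Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(4, -1.f, 1.f);
    Eigen::ArrayXf y1 = x.atan();  // element-wise arc tangent
    Eigen::ArrayXf y2 = x.unaryExpr(Eigen::internal::scalar_atan_op<float>());  // same, spelled out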
/** \internal /** \internal
* \brief Template functor to compute the inverse of a scalar * \brief Template functor to compute the inverse of a scalar
* \sa class CwiseUnaryOp, Cwise::inverse() * \sa class CwiseUnaryOp, Cwise::inverse()
File diff suppressed because it is too large
View File
@ -23,6 +23,8 @@ template<
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
{ {
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run( static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index depth, Index rows, Index cols, Index depth,
@ -51,6 +53,8 @@ template<
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor> struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
{ {
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static void run(Index rows, Index cols, Index depth, static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride, const LhsScalar* _lhs, Index lhsStride,
@ -63,11 +67,9 @@ static void run(Index rows, Index cols, Index depth,
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
Index kc = blocking.kc(); // cache block size along the K direction Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
//Index nc = blocking.nc(); // cache block size along the N direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@ -80,13 +82,11 @@ static void run(Index rows, Index cols, Index depth,
Index tid = omp_get_thread_num(); Index tid = omp_get_thread_num();
Index threads = omp_get_num_threads(); Index threads = omp_get_num_threads();
std::size_t sizeA = kc*mc; LhsScalar* blockA = blocking.blockA();
std::size_t sizeW = kc*Traits::WorkSpaceFactor; eigen_internal_assert(blockA!=0);
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
RhsScalar* blockB = blocking.blockB(); std::size_t sizeB = kc*nc;
eigen_internal_assert(blockB!=0); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
for(Index k=0; k<depth; k+=kc) for(Index k=0; k<depth; k+=kc)
@ -94,54 +94,56 @@ static void run(Index rows, Index cols, Index depth,
const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A' const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
// In order to reduce the chance that a thread has to wait for the other, // In order to reduce the chance that a thread has to wait for the other,
// let's start by packing A'. // let's start by packing B'.
pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
// Pack B_k to B' in a parallel fashion: // Pack A_k to A' in a parallel fashion:
// each thread packs the sub block B_k,j to B'_j where j is the thread id. // each thread packs the sub block A_k,i to A'_i where i is the thread id.
// However, before copying to B'_j, we have to make sure that no other thread is still using it, // However, before copying to A'_i, we have to make sure that no other thread is still using it,
// i.e., we test that info[tid].users equals 0. // i.e., we test that info[tid].users equals 0.
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {} while(info[tid].users!=0) {}
info[tid].users += threads; info[tid].users += threads;
pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
// Notify the other threads that the part B'_j is ready to go. // Notify the other threads that the part A'_i is ready to go.
info[tid].sync = k; info[tid].sync = k;
// Computes C_i += A' * B' per B'_j // Computes C_i += A' * B' per A'_i
for(Index shift=0; shift<threads; ++shift) for(Index shift=0; shift<threads; ++shift)
{ {
Index j = (tid+shift)%threads; Index i = (tid+shift)%threads;
// At this point we have to make sure that B'_j has been updated by the thread j, // At this point we have to make sure that A'_i has been updated by the thread i,
// we use testAndSetOrdered to mimic a volatile access. // we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread! // However, no need to wait for the B' part which has been updated by the current thread!
if(shift>0) if(shift>0)
while(info[j].sync!=k) {} while(info[i].sync!=k) {}
gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
} }
// Then keep going as usual with the remaining A' // Then keep going as usual with the remaining B'
for(Index i=mc; i<rows; i+=mc) for(Index j=nc; j<cols; j+=nc)
{ {
const Index actual_mc = (std::min)(i+mc,rows)-i; const Index actual_nc = (std::min)(j+nc,cols)-j;
// pack A_i,k to A' // pack B_k,j to B'
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
// C_i += A' * B' // C_j += A' * B'
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
} }
// Release all the sub blocks B'_j of B' for the current thread, // Release all the sub blocks A'_i of A' for the current thread,
// i.e., we simply decrement the number of users by 1 // i.e., we simply decrement the number of users by 1
for(Index j=0; j<threads; ++j) #pragma omp critical
{
for(Index i=0; i<threads; ++i)
#pragma omp atomic #pragma omp atomic
--(info[j].users); --(info[i].users);
}
} }
} }
else else
@ -151,38 +153,34 @@ static void run(Index rows, Index cols, Index depth,
// this is the sequential version! // this is the sequential version!
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*nc;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
// For each horizontal panel of the rhs, and corresponding panel of the lhs... // For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1)
for(Index k2=0; k2<depth; k2+=kc) for(Index k2=0; k2<depth; k2+=kc)
{ {
const Index actual_kc = (std::min)(k2+kc,depth)-k2; const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack rhs's panel into a sequential chunk of memory (L2 caching) // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the lhs's // Note that this panel will be read as many times as the number of blocks in the rhs's
// vertical panel which is, in practice, a very low number. // horizontal panel which is, in practice, a very low number.
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
// For each mc x kc block of the lhs's vertical panel... // For each kc x nc block of the rhs's horizontal panel...
// (==GEPP_VAR1) for(Index j2=0; j2<cols; j2+=nc)
for(Index i2=0; i2<rows; i2+=mc)
{ {
const Index actual_mc = (std::min)(i2+mc,rows)-i2; const Index actual_nc = (std::min)(j2+nc,cols)-j2;
// We pack the lhs's block into a sequential chunk of memory (L1 caching) // We pack the rhs's block into a sequential chunk of memory (L2 caching)
// Note that this block will be read a very high number of times, which is equal to the number of // Note that this block will be read a very high number of times, which is equal to the number of
// micro vertical panels of the large rhs's panel (e.g., cols/4 times). // micro horizontal panels of the large rhs's panel (e.g., rows/12 times).
pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
// Everything is packed, we can now call the block * panel kernel: // Everything is packed, we can now call the panel * block kernel:
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
} }
} }
} }
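A tiny self-contained analogue of the new loop order (plain C++, deliberately naive, not Eigen's packed kernels): the lhs panel is reused across every nc-wide block of the rhs, mirroring the pack_lhs/pack_rhs/gebp structure above.

    #include <algorithm>
    // C += A*B, all matrices column-major; kc and nc play the cache block sizes above.
    void blocked_gemm(const double* A, const double* B, double* C,
                      int rows, int cols, int depth, int kc, int nc)
    {
      for (int k2 = 0; k2 < depth; k2 += kc) {
        const int actual_kc = std::min(kc, depth - k2);   // rows of B', cols of A'
        for (int j2 = 0; j2 < cols; j2 += nc) {
          const int actual_nc = std::min(nc, cols - j2);  // cols of B'
          for (int j = j2; j < j2 + actual_nc; ++j)       // the role of the gebp call
            for (int k = k2; k < k2 + actual_kc; ++k)
              for (int i = 0; i < rows; ++i)
                C[i + j*rows] += A[i + k*rows] * B[k + j*depth];
        }
      }
    }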
@ -203,14 +201,13 @@ struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType> template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
struct gemm_functor struct gemm_functor
{ {
gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
BlockingType& blocking)
: m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
{} {}
void initParallelSession() const void initParallelSession() const
{ {
m_blocking.allocateB(); m_blocking.allocateA();
} }
void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
@ -225,6 +222,8 @@ struct gemm_functor
m_actualAlpha, m_blocking, info); m_actualAlpha, m_blocking, info);
} }
typedef typename Gemm::Traits Traits;
protected: protected:
const Lhs& m_lhs; const Lhs& m_lhs;
const Rhs& m_rhs; const Rhs& m_rhs;
@ -245,7 +244,6 @@ class level3_blocking
protected: protected:
LhsScalar* m_blockA; LhsScalar* m_blockA;
RhsScalar* m_blockB; RhsScalar* m_blockB;
RhsScalar* m_blockW;
DenseIndex m_mc; DenseIndex m_mc;
DenseIndex m_nc; DenseIndex m_nc;
@ -254,7 +252,7 @@ class level3_blocking
public: public:
level3_blocking() level3_blocking()
: m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0) : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
{} {}
inline DenseIndex mc() const { return m_mc; } inline DenseIndex mc() const { return m_mc; }
@ -263,7 +261,6 @@ class level3_blocking
inline LhsScalar* blockA() { return m_blockA; } inline LhsScalar* blockA() { return m_blockA; }
inline RhsScalar* blockB() { return m_blockB; } inline RhsScalar* blockB() { return m_blockB; }
inline RhsScalar* blockW() { return m_blockW; }
}; };
template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor> template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
@ -282,29 +279,25 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
typedef gebp_traits<LhsScalar,RhsScalar> Traits; typedef gebp_traits<LhsScalar,RhsScalar> Traits;
enum { enum {
SizeA = ActualRows * MaxDepth, SizeA = ActualRows * MaxDepth,
SizeB = ActualCols * MaxDepth, SizeB = ActualCols * MaxDepth
SizeW = MaxDepth * Traits::WorkSpaceFactor
}; };
EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
EIGEN_ALIGN16 RhsScalar m_staticB[SizeB]; EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];
EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
public: public:
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/) gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
{ {
this->m_mc = ActualRows; this->m_mc = ActualRows;
this->m_nc = ActualCols; this->m_nc = ActualCols;
this->m_kc = MaxDepth; this->m_kc = MaxDepth;
this->m_blockA = m_staticA; this->m_blockA = m_staticA;
this->m_blockB = m_staticB; this->m_blockB = m_staticB;
this->m_blockW = m_staticW;
} }
inline void allocateA() {} inline void allocateA() {}
inline void allocateB() {} inline void allocateB() {}
inline void allocateW() {}
inline void allocateAll() {} inline void allocateAll() {}
}; };
@ -323,20 +316,28 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
DenseIndex m_sizeA; DenseIndex m_sizeA;
DenseIndex m_sizeB; DenseIndex m_sizeB;
DenseIndex m_sizeW;
public: public:
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth) gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
{ {
this->m_mc = Transpose ? cols : rows; this->m_mc = Transpose ? cols : rows;
this->m_nc = Transpose ? rows : cols; this->m_nc = Transpose ? rows : cols;
this->m_kc = depth; this->m_kc = depth;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc); if(full_rows)
{
DenseIndex m = this->m_mc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
}
else // full columns
{
DenseIndex n = this->m_nc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
}
m_sizeA = this->m_mc * this->m_kc; m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc; m_sizeB = this->m_kc * this->m_nc;
m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
} }
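A hedged reading of the new flag: computeProductBlockingSizes shrinks the arguments passed by reference, so handing it a local copy of m_mc (full_rows) leaves the row block at its full size and only adapts kc and nc, which matches the GEMM path above that packs one full-height lhs panel; with full_rows == false it is the column block m_nc that stays whole.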
void allocateA() void allocateA()
@ -351,24 +352,16 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
this->m_blockB = aligned_new<RhsScalar>(m_sizeB); this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
} }
void allocateW()
{
if(this->m_blockW==0)
this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
}
void allocateAll() void allocateAll()
{ {
allocateA(); allocateA();
allocateB(); allocateB();
allocateW();
} }
~gemm_blocking_space() ~gemm_blocking_space()
{ {
aligned_delete(this->m_blockA, m_sizeA); aligned_delete(this->m_blockA, m_sizeA);
aligned_delete(this->m_blockB, m_sizeB); aligned_delete(this->m_blockB, m_sizeB);
aligned_delete(this->m_blockW, m_sizeW);
} }
}; };
@ -395,6 +388,36 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
} }
template<typename Dest>
inline void evalTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() = m_lhs .lazyProduct( m_rhs );
else
{
dst.setZero();
scaleAndAddTo(dst,Scalar(1));
}
}
template<typename Dest>
inline void addTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() += m_lhs .lazyProduct( m_rhs );
else
scaleAndAddTo(dst,Scalar(1));
}
template<typename Dest>
inline void subTo(Dest& dst) const
{
if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
dst.noalias() -= m_lhs .lazyProduct( m_rhs );
else
scaleAndAddTo(dst,Scalar(-1));
}
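Worked instance of the threshold above: for a 4x4 = 4x4 * 4x4 product, m_rhs.rows()+dst.rows()+dst.cols() == 4+4+4 == 12 < 20, so the coefficient-based lazyProduct path is taken and the blocked GEMM machinery is skipped; a 20x20 * 20x20 product scores 60 and falls through to scaleAndAddTo.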
template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
{ {
eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
@ -417,7 +440,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
_ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
} }
View File
@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
if(mc > Traits::nr) if(mc > Traits::nr)
mc = (mc/Traits::nr)*Traits::nr; mc = (mc/Traits::nr)*Traits::nr;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
std::size_t sizeB = sizeW + kc*size;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
RhsScalar* blockB = allocatedBlockB + sizeW;
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
// 3 - after the diagonal => processed with gebp or skipped // 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower) if (UpLo==Lower)
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-1, -1, 0, 0, allocatedBlockB); -1, -1, 0, 0);
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper) if (UpLo==Upper)
{ {
Index j2 = i2+actual_mc; Index j2 = i2+actual_mc;
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-1, -1, 0, 0, allocatedBlockB); -1, -1, 0, 0);
} }
} }
} }
@ -136,7 +133,7 @@ struct tribb_kernel
enum { enum {
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
}; };
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace) void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{ {
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
@ -150,7 +147,7 @@ struct tribb_kernel
if(UpLo==Upper) if(UpLo==Upper)
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace); -1, -1, 0, 0);
// selfadjoint micro block // selfadjoint micro block
{ {
@ -158,7 +155,7 @@ struct tribb_kernel
buffer.setZero(); buffer.setZero();
// 1 - apply the kernel on the temporary buffer // 1 - apply the kernel on the temporary buffer
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace); -1, -1, 0, 0);
// 2 - triangular accumulation // 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1) for(Index j1=0; j1<actualBlockSize; ++j1)
{ {
@ -173,7 +170,7 @@ struct tribb_kernel
{ {
Index i = j+actualBlockSize; Index i = j+actualBlockSize;
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-1, -1, 0, 0, workspace); -1, -1, 0, 0);
} }
} }
} }
@ -278,6 +275,8 @@ template<typename MatrixType, unsigned int UpLo>
template<typename ProductDerived, typename _Lhs, typename _Rhs> template<typename ProductDerived, typename _Lhs, typename _Rhs>
TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha) TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
{ {
eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha); general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
return *this; return *this;
View File
@ -141,6 +141,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
alignedSize = 0; alignedSize = 0;
alignedStart = 0; alignedStart = 0;
} }
else if(LhsPacketSize > 4)
{
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
// Currently, it seems to be better to perform unaligned loads anyway
alignmentPattern = NoneAligned;
}
else if (LhsPacketSize>1) else if (LhsPacketSize>1)
{ {
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
@ -405,6 +411,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
alignedSize = 0; alignedSize = 0;
alignedStart = 0; alignedStart = 0;
} }
else if(LhsPacketSize > 4)
{
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
alignmentPattern = NoneAligned;
}
else if (LhsPacketSize>1) else if (LhsPacketSize>1)
{ {
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
@ -442,7 +453,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
{ {
EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generating good binary code // this helps the compiler generating good binary code
@ -551,7 +562,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{ {
for (Index i=start; i<end; ++i) for (Index i=start; i<end; ++i)
{ {
EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0); ResPacket ptmp0 = pset1<ResPacket>(tmp0);
const LhsScalar* lhs0 = lhs + i*lhsStride; const LhsScalar* lhs0 = lhs + i*lhsStride;
// process first unaligned result's coeffs // process first unaligned result's coeffs
View File
@ -73,13 +73,13 @@ namespace internal {
template<typename Index> struct GemmParallelInfo template<typename Index> struct GemmParallelInfo
{ {
GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {} GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
int volatile sync; int volatile sync;
int volatile users; int volatile users;
Index rhs_start; Index lhs_start;
Index rhs_length; Index lhs_length;
}; };
template<bool Condition, typename Functor, typename Index> template<bool Condition, typename Functor, typename Index>
@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
if((!Condition) || (omp_get_num_threads()>1)) if((!Condition) || (omp_get_num_threads()>1))
return func(0,rows, 0,cols); return func(0,rows, 0,cols);
Index size = transpose ? cols : rows; Index size = transpose ? rows : cols;
// 2- compute the maximal number of threads from the size of the product: // 2- compute the maximal number of threads from the size of the product:
// FIXME this has to be fine tuned // FIXME this has to be fine tuned
@ -126,26 +126,25 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
std::swap(rows,cols); std::swap(rows,cols);
Index blockCols = (cols / threads) & ~Index(0x3); Index blockCols = (cols / threads) & ~Index(0x3);
Index blockRows = (rows / threads) & ~Index(0x7); Index blockRows = (rows / threads);
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads]; GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
#pragma omp parallel for schedule(static,1) num_threads(threads) #pragma omp parallel num_threads(threads)
for(Index i=0; i<threads; ++i)
{ {
Index i = omp_get_thread_num();
Index r0 = i*blockRows; Index r0 = i*blockRows;
Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows; Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
Index c0 = i*blockCols; Index c0 = i*blockCols;
Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols; Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
info[i].rhs_start = c0; info[i].lhs_start = r0;
info[i].rhs_length = actualBlockCols; info[i].lhs_length = actualBlockRows;
if(transpose) if(transpose) func(c0, actualBlockCols, 0, rows, info);
func(0, cols, r0, actualBlockRows, info); else func(0, rows, c0, actualBlockCols, info);
else
func(r0, actualBlockRows, 0,cols, info);
} }
delete[] info; delete[] info;
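Worked example of the new row split (assuming Traits::mr == 8): rows == 1000 and threads == 4 give blockRows == (1000/4) rounded down to a multiple of 8, i.e. 248; threads 0-2 take 248 rows each and the last thread takes the remaining 1000 - 3*248 == 256, so every full block stays aligned with the gebp micro-panel height.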
View File
@ -15,7 +15,7 @@ namespace Eigen {
namespace internal { namespace internal {
// pack a selfadjoint block diagonal for use with the gebp_kernel // pack a selfadjoint block diagonal for use with the gebp_kernel
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder> template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
struct symm_pack_lhs struct symm_pack_lhs
{ {
template<int BlockRows> inline template<int BlockRows> inline
@ -45,22 +45,29 @@ struct symm_pack_lhs
} }
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{ {
enum { PacketSize = packet_traits<Scalar>::size };
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0; Index count = 0;
Index peeled_mc = (rows/Pack1)*Pack1; //Index peeled_mc3 = (rows/Pack1)*Pack1;
for(Index i=0; i<peeled_mc; i+=Pack1)
{
pack<Pack1>(blockA, lhs, cols, i, count);
}
if(rows-peeled_mc>=Pack2) const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
{ const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
pack<Pack2>(blockA, lhs, cols, peeled_mc, count); const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
peeled_mc += Pack2;
} if(Pack1>=3*PacketSize)
for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
pack<3*PacketSize>(blockA, lhs, cols, i, count);
if(Pack1>=2*PacketSize)
for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
pack<2*PacketSize>(blockA, lhs, cols, i, count);
if(Pack1>=1*PacketSize)
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
pack<1*PacketSize>(blockA, lhs, cols, i, count);
// do the same with mr==1 // do the same with mr==1
for(Index i=peeled_mc; i<rows; i++) for(Index i=peeled_mc1; i<rows; i++)
{ {
for(Index k=0; k<i; k++) for(Index k=0; k<i; k++)
blockA[count++] = lhs(i, k); // normal blockA[count++] = lhs(i, k); // normal
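Worked example of the new peeling bounds (assuming float with AVX, i.e. PacketSize == 8 and Pack1 == 24): rows == 100 gives peeled_mc3 == (100/24)*24 == 96, peeled_mc2 == 96 + ((100-96)/16)*16 == 96, and peeled_mc1 == (100/8)*8 == 96, so rows 96..99 fall through to the scalar mr==1 loop.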
@ -82,7 +89,8 @@ struct symm_pack_rhs
Index end_k = k2 + rows; Index end_k = k2 + rows;
Index count = 0; Index count = 0;
const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride); const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
Index packet_cols = (cols/nr)*nr; Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
// first part: normal case // first part: normal case
for(Index j2=0; j2<k2; j2+=nr) for(Index j2=0; j2<k2; j2+=nr)
@ -91,17 +99,27 @@ struct symm_pack_rhs
{ {
blockB[count+0] = rhs(k,j2+0); blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1); blockB[count+1] = rhs(k,j2+1);
if (nr==4) if (nr>=4)
{ {
blockB[count+2] = rhs(k,j2+2); blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3); blockB[count+3] = rhs(k,j2+3);
} }
if (nr>=8)
{
blockB[count+4] = rhs(k,j2+4);
blockB[count+5] = rhs(k,j2+5);
blockB[count+6] = rhs(k,j2+6);
blockB[count+7] = rhs(k,j2+7);
}
count += nr; count += nr;
} }
} }
// second part: diagonal block // second part: diagonal block
for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr) Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
if(nr>=8)
{
for(Index j2=k2; j2<end8; j2+=8)
{ {
// again we can split vertically in three different parts (transpose, symmetric, normal) // again we can split vertically in three different parts (transpose, symmetric, normal)
// transpose // transpose
@ -109,16 +127,17 @@ struct symm_pack_rhs
{ {
blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k)); blockB[count+1] = numext::conj(rhs(j2+1,k));
if (nr==4)
{
blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k)); blockB[count+3] = numext::conj(rhs(j2+3,k));
} blockB[count+4] = numext::conj(rhs(j2+4,k));
count += nr; blockB[count+5] = numext::conj(rhs(j2+5,k));
blockB[count+6] = numext::conj(rhs(j2+6,k));
blockB[count+7] = numext::conj(rhs(j2+7,k));
count += 8;
} }
// symmetric // symmetric
Index h = 0; Index h = 0;
for(Index k=j2; k<j2+nr; k++) for(Index k=j2; k<j2+8; k++)
{ {
// normal // normal
for (Index w=0 ; w<h; ++w) for (Index w=0 ; w<h; ++w)
@ -127,43 +146,104 @@ struct symm_pack_rhs
blockB[count+h] = numext::real(rhs(k,k)); blockB[count+h] = numext::real(rhs(k,k));
// transpose // transpose
for (Index w=h+1 ; w<nr; ++w) for (Index w=h+1 ; w<8; ++w)
blockB[count+w] = numext::conj(rhs(j2+w,k)); blockB[count+w] = numext::conj(rhs(j2+w,k));
count += nr; count += 8;
++h; ++h;
} }
// normal // normal
for(Index k=j2+nr; k<end_k; k++) for(Index k=j2+8; k<end_k; k++)
{ {
blockB[count+0] = rhs(k,j2+0); blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1); blockB[count+1] = rhs(k,j2+1);
if (nr==4)
{
blockB[count+2] = rhs(k,j2+2); blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3); blockB[count+3] = rhs(k,j2+3);
blockB[count+4] = rhs(k,j2+4);
blockB[count+5] = rhs(k,j2+5);
blockB[count+6] = rhs(k,j2+6);
blockB[count+7] = rhs(k,j2+7);
count += 8;
}
}
}
if(nr>=4)
{
for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
{
// again we can split vertically in three different parts (transpose, symmetric, normal)
// transpose
for(Index k=k2; k<j2; k++)
{
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k));
count += 4;
}
// symmetric
Index h = 0;
for(Index k=j2; k<j2+4; k++)
{
// normal
for (Index w=0 ; w<h; ++w)
blockB[count+w] = rhs(k,j2+w);
blockB[count+h] = numext::real(rhs(k,k));
// transpose
for (Index w=h+1 ; w<4; ++w)
blockB[count+w] = numext::conj(rhs(j2+w,k));
count += 4;
++h;
}
// normal
for(Index k=j2+4; k<end_k; k++)
{
blockB[count+0] = rhs(k,j2+0);
blockB[count+1] = rhs(k,j2+1);
blockB[count+2] = rhs(k,j2+2);
blockB[count+3] = rhs(k,j2+3);
count += 4;
} }
count += nr;
} }
} }
// third part: transposed // third part: transposed
for(Index j2=k2+rows; j2<packet_cols; j2+=nr) if(nr>=8)
{
for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
{ {
for(Index k=k2; k<end_k; k++) for(Index k=k2; k<end_k; k++)
{ {
blockB[count+0] = numext::conj(rhs(j2+0,k)); blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k)); blockB[count+1] = numext::conj(rhs(j2+1,k));
if (nr==4)
{
blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k)); blockB[count+3] = numext::conj(rhs(j2+3,k));
blockB[count+4] = numext::conj(rhs(j2+4,k));
blockB[count+5] = numext::conj(rhs(j2+5,k));
blockB[count+6] = numext::conj(rhs(j2+6,k));
blockB[count+7] = numext::conj(rhs(j2+7,k));
count += 8;
}
}
}
if(nr>=4)
{
for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
{
for(Index k=k2; k<end_k; k++)
{
blockB[count+0] = numext::conj(rhs(j2+0,k));
blockB[count+1] = numext::conj(rhs(j2+1,k));
blockB[count+2] = numext::conj(rhs(j2+2,k));
blockB[count+3] = numext::conj(rhs(j2+3,k));
count += 4;
} }
count += nr;
} }
} }
// copy the remaining columns one at a time (=> the same with nr==1) // copy the remaining columns one at a time (=> the same with nr==1)
for(Index j2=packet_cols; j2<cols; ++j2) for(Index j2=packet_cols4; j2<cols; ++j2)
{ {
// transpose // transpose
Index half = (std::min)(end_k,j2); Index half = (std::min)(end_k,j2);
@ -261,11 +341,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// kc must be smaller than mc // kc must be smaller than mc
kc = (std::min)(kc,mc); kc = (std::min)(kc,mc);
std::size_t sizeW = kc*Traits::WorkSpaceFactor; std::size_t sizeB = kc*cols;
std::size_t sizeB = sizeW + kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB + sizeW; Scalar* blockB = allocatedBlockB;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@ -348,11 +427,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
Index mc = rows; // cache block size along the M direction Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
std::size_t sizeW = kc*Traits::WorkSpaceFactor; std::size_t sizeB = kc*cols;
std::size_t sizeB = sizeW + kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB + sizeW; Scalar* blockB = allocatedBlockB;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;

View File

@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
triangularBuffer.setZero(); triangularBuffer.setZero();
@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset, blockW); actualPanelWidth, actual_kc, 0, blockBOffset);
// GEBP with remaining micro panel // GEBP with remaining micro panel
if (lengthTarget>0) if (lengthTarget>0)
@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset, blockW); actualPanelWidth, actual_kc, 0, blockBOffset);
} }
} }
} }
@ -211,7 +209,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
} }
} }
} }
@ -265,12 +263,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
triangularBuffer.setZero(); triangularBuffer.setZero();
@ -304,6 +300,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
Scalar* geb = blockB+ts*ts; Scalar* geb = blockB+ts*ts;
geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
@ -357,14 +354,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
actual_mc, panelLength, actualPanelWidth, actual_mc, panelLength, actualPanelWidth,
alpha, alpha,
actual_kc, actual_kc, // strides actual_kc, actual_kc, // strides
blockOffset, blockOffset,// offsets blockOffset, blockOffset);// offsets
blockW); // workspace
} }
} }
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
blockA, geb, actual_mc, actual_kc, rs, blockA, geb, actual_mc, actual_kc, rs,
alpha, alpha,
-1, -1, 0, 0, blockW); -1, -1, 0, 0);
} }
} }
} }

View File

@ -129,7 +129,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \ MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
/* Non-square case - doesn't fit MKL ?TRMV. Fall back to the default triangular product*/ \ /* Non-square case - doesn't fit MKL ?TRMV. Fall back to the default triangular product*/ \
if (size<(std::max)(rows,cols)) { \ if (size<(std::max)(rows,cols)) { \
typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
x = x_tmp.data(); \ x = x_tmp.data(); \
if (size<rows) { \ if (size<rows) { \
@ -214,7 +213,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \ MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
/* Non-square case - doesn't fit MKL ?TRMV. Fall back to the default triangular product*/ \ /* Non-square case - doesn't fit MKL ?TRMV. Fall back to the default triangular product*/ \
if (size<(std::max)(rows,cols)) { \ if (size<(std::max)(rows,cols)) { \
typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
x = x_tmp.data(); \ x = x_tmp.data(); \
if (size<rows) { \ if (size<rows) { \

View File

@ -66,11 +66,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*cols; std::size_t sizeB = kc*cols;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
conj_if<Conjugate> conj; conj_if<Conjugate> conj;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
actualPanelWidth, actual_kc, 0, blockBOffset, blockW); actualPanelWidth, actual_kc, 0, blockBOffset);
} }
} }
} }
@ -174,7 +172,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
{ {
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW); gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
} }
} }
} }
@ -215,11 +213,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
std::size_t sizeA = kc*mc; std::size_t sizeA = kc*mc;
std::size_t sizeB = kc*size; std::size_t sizeB = kc*size;
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
conj_if<Conjugate> conj; conj_if<Conjugate> conj;
gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
actual_mc, panelLength, actualPanelWidth, actual_mc, panelLength, actualPanelWidth,
Scalar(-1), Scalar(-1),
actual_kc, actual_kc, // strides actual_kc, actual_kc, // strides
panelOffset, panelOffset, // offsets panelOffset, panelOffset); // offsets
blockW); // workspace
} }
// unblocked triangular solve // unblocked triangular solve
@ -317,7 +312,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
if (rs>0) if (rs>0)
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
actual_mc, actual_kc, rs, Scalar(-1), actual_mc, actual_kc, rs, Scalar(-1),
-1, -1, 0, 0, blockW); -1, -1, 0, 0);
} }
} }
} }

View File

@ -54,8 +54,25 @@
#endif #endif
#if defined EIGEN_USE_MKL #if defined EIGEN_USE_MKL
# include <mkl.h>
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
# ifndef INTEL_MKL_VERSION
# undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
# elif INTEL_MKL_VERSION < 100305 /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/
# undef EIGEN_USE_MKL
# endif
# ifndef EIGEN_USE_MKL
/*If the MKL version is too old, undef everything*/
# undef EIGEN_USE_MKL_ALL
# undef EIGEN_USE_BLAS
# undef EIGEN_USE_LAPACKE
# undef EIGEN_USE_MKL_VML
# undef EIGEN_USE_LAPACKE_STRICT
# undef EIGEN_USE_LAPACKE
# endif
#endif
#include <mkl.h> #if defined EIGEN_USE_MKL
#include <mkl_lapacke.h> #include <mkl_lapacke.h>
#define EIGEN_MKL_VML_THRESHOLD 128 #define EIGEN_MKL_VML_THRESHOLD 128
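
A minimal sketch of how client code can observe the outcome of this guard (assuming INTEL_MKL_VERSION encodes major*10000 + minor*100 + update, so 100305 corresponds to MKL 10.3 update 5, the first version shipping lapacke.h per the release notes cited above):

    #include <Eigen/Core>
    #ifdef EIGEN_USE_MKL
      // MKL >= 10.3 was found: Eigen dispatches to MKL BLAS/LAPACKE where possible.
    #else
      // Either MKL support was never requested, or the guard above undefined it
      // because the detected MKL is too old.
    #endif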

View File

@ -66,13 +66,24 @@
#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
#endif #endif
// Defines the boundary (in bytes) on which the data needs to be aligned. Note
// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
// aligned at all regardless of the value of this #define.
#define EIGEN_ALIGN_BYTES 16
#ifdef EIGEN_DONT_ALIGN #ifdef EIGEN_DONT_ALIGN
#ifndef EIGEN_DONT_ALIGN_STATICALLY #ifndef EIGEN_DONT_ALIGN_STATICALLY
#define EIGEN_DONT_ALIGN_STATICALLY #define EIGEN_DONT_ALIGN_STATICALLY
#endif #endif
#define EIGEN_ALIGN 0 #define EIGEN_ALIGN 0
#else #elif !defined(EIGEN_DONT_VECTORIZE)
#if defined(__AVX__)
#undef EIGEN_ALIGN_BYTES
#define EIGEN_ALIGN_BYTES 32
#endif
#define EIGEN_ALIGN 1 #define EIGEN_ALIGN 1
#else
#define EIGEN_ALIGN 0
#endif #endif
// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
@ -286,13 +297,19 @@ namespace Eigen {
#endif #endif
#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES)
#if EIGEN_ALIGN_STATICALLY #if EIGEN_ALIGN_STATICALLY
#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16 #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32
#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT
#else #else
#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) #define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
#define EIGEN_USER_ALIGN16 #define EIGEN_USER_ALIGN16
#define EIGEN_USER_ALIGN32
#define EIGEN_USER_ALIGN_DEFAULT
#endif #endif
#ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
@ -326,13 +343,13 @@ namespace Eigen {
#elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) #elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =; \ using Base::operator =; \
EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
template <typename OtherDerived> \ template <typename OtherDerived> \
EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
#else #else
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =; \ using Base::operator =; \
EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
{ \ { \
Base::operator=(other); \ Base::operator=(other); \
return *this; \ return *this; \
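
A short sketch of how the new default-alignment macros compose (assuming an AVX build, where EIGEN_ALIGN_BYTES becomes 32; the struct name is illustrative):

    #include <Eigen/Core>
    // EIGEN_ALIGN_DEFAULT expands to EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES),
    // i.e. a 32-byte boundary under __AVX__ and a 16-byte boundary otherwise.
    struct Workspace {                        // hypothetical user type
      EIGEN_ALIGN_DEFAULT float packet[8];    // one full AVX packet of floats
    };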

View File

@ -32,7 +32,7 @@
// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
// quite safe, at least within the context of glibc, to equate 64-bit with LP64. // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
&& defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16)
#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
#else #else
#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@ -42,14 +42,14 @@
// See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
// See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) #if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN_BYTES == 16)
#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
#else #else
#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
#endif #endif
#if defined(__APPLE__) \ #if (defined(__APPLE__) && (EIGEN_ALIGN_BYTES == 16)) \
|| defined(_WIN64) \ || (defined(_WIN64) && (EIGEN_ALIGN_BYTES == 16)) \
|| EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
|| EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
#define EIGEN_MALLOC_ALREADY_ALIGNED 1 #define EIGEN_MALLOC_ALREADY_ALIGNED 1
@ -73,7 +73,7 @@
#define EIGEN_HAS_POSIX_MEMALIGN 0 #define EIGEN_HAS_POSIX_MEMALIGN 0
#endif #endif
#ifdef EIGEN_VECTORIZE_SSE #if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
#define EIGEN_HAS_MM_MALLOC 1 #define EIGEN_HAS_MM_MALLOC 1
#else #else
#define EIGEN_HAS_MM_MALLOC 0 #define EIGEN_HAS_MM_MALLOC 0
@ -89,7 +89,7 @@ inline void throw_std_bad_alloc()
#ifdef EIGEN_EXCEPTIONS #ifdef EIGEN_EXCEPTIONS
throw std::bad_alloc(); throw std::bad_alloc();
#else #else
std::size_t huge = -1; std::size_t huge = static_cast<std::size_t>(-1);
new int[huge]; new int[huge];
#endif #endif
} }
@ -105,9 +105,9 @@ inline void throw_std_bad_alloc()
*/ */
inline void* handmade_aligned_malloc(std::size_t size) inline void* handmade_aligned_malloc(std::size_t size)
{ {
void *original = std::malloc(size+16); void *original = std::malloc(size+EIGEN_ALIGN_BYTES);
if (original == 0) return 0; if (original == 0) return 0;
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16); void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
*(reinterpret_cast<void**>(aligned) - 1) = original; *(reinterpret_cast<void**>(aligned) - 1) = original;
return aligned; return aligned;
} }
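
A worked instance of the rounding above, assuming EIGEN_ALIGN_BYTES == 32 and that std::malloc returned original == 0x1008:

    (0x1008 & ~std::size_t(31)) == 0x1000   // round down to the 32-byte grid
    0x1000 + 32                 == 0x1020   // first 32-byte boundary past original

aligned - original is then 0x18 = 24 bytes, which leaves room for the void* stored just below the returned address so that the matching free can recover the original pointer.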
@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
if (ptr == 0) return handmade_aligned_malloc(size); if (ptr == 0) return handmade_aligned_malloc(size);
void *original = *(reinterpret_cast<void**>(ptr) - 1); void *original = *(reinterpret_cast<void**>(ptr) - 1);
std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original); std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
original = std::realloc(original,size+16); original = std::realloc(original,size+EIGEN_ALIGN_BYTES);
if (original == 0) return 0; if (original == 0) return 0;
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16); void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
void *previous_aligned = static_cast<char *>(original)+previous_offset; void *previous_aligned = static_cast<char *>(original)+previous_offset;
if(aligned!=previous_aligned) if(aligned!=previous_aligned)
std::memmove(aligned, previous_aligned, size); std::memmove(aligned, previous_aligned, size);
@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed()
{} {}
#endif #endif
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16-byte alignment. /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16- or 32-byte alignment depending on the requirements.
* On allocation error, the returned pointer is null, and std::bad_alloc is thrown. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
*/ */
inline void* aligned_malloc(size_t size) inline void* aligned_malloc(size_t size)
@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size)
#elif EIGEN_MALLOC_ALREADY_ALIGNED #elif EIGEN_MALLOC_ALREADY_ALIGNED
result = std::malloc(size); result = std::malloc(size);
#elif EIGEN_HAS_POSIX_MEMALIGN #elif EIGEN_HAS_POSIX_MEMALIGN
if(posix_memalign(&result, 16, size)) result = 0; if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0;
#elif EIGEN_HAS_MM_MALLOC #elif EIGEN_HAS_MM_MALLOC
result = _mm_malloc(size, 16); result = _mm_malloc(size, EIGEN_ALIGN_BYTES);
#elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
result = _aligned_malloc(size, 16); result = _aligned_malloc(size, EIGEN_ALIGN_BYTES);
#else #else
result = handmade_aligned_malloc(size); result = handmade_aligned_malloc(size);
#endif #endif
@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
// implements _mm_malloc/_mm_free based on the corresponding _aligned_ // implements _mm_malloc/_mm_free based on the corresponding _aligned_
// functions. This may not always be the case and we just try to be safe. // functions. This may not always be the case and we just try to be safe.
#if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free) #if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free)
result = _aligned_realloc(ptr,new_size,16); result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
#else #else
result = generic_aligned_realloc(ptr,new_size,old_size); result = generic_aligned_realloc(ptr,new_size,old_size);
#endif #endif
#elif defined(_MSC_VER) && (!defined(_WIN32_WCE)) #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
result = _aligned_realloc(ptr,new_size,16); result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
#else #else
result = handmade_aligned_realloc(ptr,new_size,old_size); result = handmade_aligned_realloc(ptr,new_size,old_size);
#endif #endif
@ -552,7 +552,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
// you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
// to the appropriate stack allocation function // to the appropriate stack allocation function
#ifndef EIGEN_ALLOCA #ifndef EIGEN_ALLOCA
#if (defined __linux__) #if (defined __linux__) || (defined __APPLE__)
#define EIGEN_ALLOCA alloca #define EIGEN_ALLOCA alloca
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
#define EIGEN_ALLOCA _alloca #define EIGEN_ALLOCA _alloca
@ -607,9 +607,9 @@ template<typename T> class aligned_stack_memory_handler
* The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token. * The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token.
*/ */
#ifdef EIGEN_ALLOCA #ifdef EIGEN_ALLOCA
// The native alloca() that comes with llvm aligns buffers on 16 bytes even when AVX is enabled.
#if defined(__arm__) || defined(_WIN32) #if defined(__arm__) || defined(_WIN32) || EIGEN_ALIGN_BYTES > 16
#define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16) #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
#else #else
#define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
#endif #endif
@ -679,7 +679,7 @@ template<typename T> class aligned_stack_memory_handler
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0))) EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0)))
/****************************************************************************/ /****************************************************************************/
@ -700,7 +700,7 @@ template<typename T> class aligned_stack_memory_handler
* \sa \ref TopicStlContainers. * \sa \ref TopicStlContainers.
*/ */
template<class T> template<class T>
class aligned_allocator class aligned_allocator : public std::allocator<T>
{ {
public: public:
typedef size_t size_type; typedef size_t size_type;
@ -717,81 +717,25 @@ public:
typedef aligned_allocator<U> other; typedef aligned_allocator<U> other;
}; };
pointer address( reference value ) const aligned_allocator() : std::allocator<T>() {}
{
return &value;
}
const_pointer address( const_reference value ) const aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
{
return &value;
}
aligned_allocator()
{
}
aligned_allocator( const aligned_allocator& )
{
}
template<class U> template<class U>
aligned_allocator( const aligned_allocator<U>& ) aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
{
}
~aligned_allocator() ~aligned_allocator() {}
{
}
size_type max_size() const pointer allocate(size_type num, const void* /*hint*/ = 0)
{ {
return (std::numeric_limits<size_type>::max)();
}
pointer allocate( size_type num, const void* hint = 0 )
{
EIGEN_UNUSED_VARIABLE(hint);
internal::check_size_for_overflow<T>(num); internal::check_size_for_overflow<T>(num);
return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) ); return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
} }
void construct( pointer p, const T& value ) void deallocate(pointer p, size_type /*num*/)
{ {
::new( p ) T( value ); internal::aligned_free(p);
} }
#if (__cplusplus >= 201103L)
template <typename U, typename... Args>
void construct( U* u, Args&&... args)
{
::new( static_cast<void*>(u) ) U( std::forward<Args>( args )... );
}
#endif
void destroy( pointer p )
{
p->~T();
}
#if (__cplusplus >= 201103L)
template <typename U>
void destroy( U* u )
{
u->~U();
}
#endif
void deallocate( pointer p, size_type /*num*/ )
{
internal::aligned_free( p );
}
bool operator!=(const aligned_allocator<T>& ) const
{ return false; }
bool operator==(const aligned_allocator<T>& ) const
{ return true; }
}; };
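
The allocator is typically used to make STL containers honor Eigen's alignment for fixed-size vectorizable types (a usage sketch; the container contents are illustrative):

    #include <vector>
    #include <Eigen/Core>

    std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > pts(16);
    pts[0] = Eigen::Vector4f::Ones();   // each element lands on an aligned address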
//---------- Cache sizes ---------- //---------- Cache sizes ----------
@ -823,9 +767,9 @@ namespace internal {
#ifdef EIGEN_CPUID #ifdef EIGEN_CPUID
inline bool cpuid_is_vendor(int abcd[4], const char* vendor) inline bool cpuid_is_vendor(int abcd[4], const int vendor[3])
{ {
return abcd[1]==(reinterpret_cast<const int*>(vendor))[0] && abcd[3]==(reinterpret_cast<const int*>(vendor))[1] && abcd[2]==(reinterpret_cast<const int*>(vendor))[2]; return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
} }
inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
@ -967,13 +911,16 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3)
{ {
#ifdef EIGEN_CPUID #ifdef EIGEN_CPUID
int abcd[4]; int abcd[4];
const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"
// identify the CPU vendor // identify the CPU vendor
EIGEN_CPUID(abcd,0x0,0); EIGEN_CPUID(abcd,0x0,0);
int max_std_funcs = abcd[1]; int max_std_funcs = abcd[1];
if(cpuid_is_vendor(abcd,"GenuineIntel")) if(cpuid_is_vendor(abcd,GenuineIntel))
queryCacheSizes_intel(l1,l2,l3,max_std_funcs); queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!")) else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
queryCacheSizes_amd(l1,l2,l3); queryCacheSizes_amd(l1,l2,l3);
else else
// by default let's use Intel's API // by default let's use Intel's API
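
The integer vendor ids are simply the CPUID vendor string reinterpreted as three little-endian 32-bit words, which is what the old reinterpret_cast relied on; spelling them out avoids the aliasing cast. For instance:

    // "Genu" = {'G','e','n','u'} = {0x47,0x65,0x6E,0x75}  ->  0x756E6547 (little-endian)
    // CPUID(0) returns the vendor as EBX="Genu", EDX="ineI", ECX="ntel",
    // matching the abcd[1], abcd[3], abcd[2] comparison order in cpuid_is_vendor().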

View File

@ -101,6 +101,7 @@ template<typename T> struct packet_traits;
template<typename T> struct unpacket_traits template<typename T> struct unpacket_traits
{ {
typedef T type; typedef T type;
typedef T half;
enum {size=1}; enum {size=1};
}; };
@ -137,7 +138,7 @@ class compute_matrix_flags
((Options&DontAlign)==0) ((Options&DontAlign)==0)
&& ( && (
#if EIGEN_ALIGN_STATICALLY #if EIGEN_ALIGN_STATICALLY
((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0)) ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0))
#else #else
0 0
#endif #endif

View File

@ -275,10 +275,11 @@ template<typename _MatrixType> class EigenSolver
*/ */
EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true); EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
/** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
ComputationInfo info() const ComputationInfo info() const
{ {
eigen_assert(m_isInitialized && "EigenSolver is not initialized."); eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
return m_realSchur.info(); return m_info;
} }
/** \brief Sets the maximum number of iterations allowed. */ /** \brief Sets the maximum number of iterations allowed. */
@ -302,6 +303,7 @@ template<typename _MatrixType> class EigenSolver
EigenvalueType m_eivalues; EigenvalueType m_eivalues;
bool m_isInitialized; bool m_isInitialized;
bool m_eigenvectorsOk; bool m_eigenvectorsOk;
ComputationInfo m_info;
RealSchur<MatrixType> m_realSchur; RealSchur<MatrixType> m_realSchur;
MatrixType m_matT; MatrixType m_matT;
@ -366,12 +368,16 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
{ {
using std::sqrt; using std::sqrt;
using std::abs; using std::abs;
using std::max;
using numext::isfinite;
eigen_assert(matrix.cols() == matrix.rows()); eigen_assert(matrix.cols() == matrix.rows());
// Reduce to real Schur form. // Reduce to real Schur form.
m_realSchur.compute(matrix, computeEigenvectors); m_realSchur.compute(matrix, computeEigenvectors);
if (m_realSchur.info() == Success) m_info = m_realSchur.info();
if (m_info == Success)
{ {
m_matT = m_realSchur.matrixT(); m_matT = m_realSchur.matrixT();
if (computeEigenvectors) if (computeEigenvectors)
@ -385,14 +391,40 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0))
{ {
m_eivalues.coeffRef(i) = m_matT.coeff(i, i); m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
if(!isfinite(m_eivalues.coeffRef(i)))
{
m_isInitialized = true;
m_eigenvectorsOk = false;
m_info = NumericalIssue;
return *this;
}
++i; ++i;
} }
else else
{ {
Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1)); Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1));
Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1))); Scalar z;
// Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
// without overflow
{
Scalar t0 = m_matT.coeff(i+1, i);
Scalar t1 = m_matT.coeff(i, i+1);
Scalar maxval = (max)(abs(p),(max)(abs(t0),abs(t1)));
t0 /= maxval;
t1 /= maxval;
Scalar p0 = p/maxval;
z = maxval * sqrt(abs(p0 * p0 + t0 * t1));
}
m_eivalues.coeffRef(i) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z); m_eivalues.coeffRef(i) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z); m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
if(!(isfinite(m_eivalues.coeffRef(i)) && isfinite(m_eivalues.coeffRef(i+1))))
{
m_isInitialized = true;
m_eigenvectorsOk = false;
m_info = NumericalIssue;
return *this;
}
i += 2; i += 2;
} }
} }
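
The rescaling makes the overflow protection explicit: with m = max(|p|, |t0|, |t1|), the block computes

    z = m * sqrt(|(p/m)^2 + (t0/m)*(t1/m)|) = sqrt(|p^2 + t0*t1|)

and since every scaled operand has magnitude at most 1, no intermediate product can overflow even when p, t0 and t1 approach the largest representable value.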
@ -581,7 +613,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
} }
else else
{ {
eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)"); // this should not happen
} }
} }

View File

@ -77,7 +77,9 @@ public:
* represents an invalid rotation. */ * represents an invalid rotation. */
template<typename Derived> template<typename Derived>
inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {} inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
/** Constructs and initializes the angle-axis rotation from a quaternion \a q. */ /** Constructs and initializes the angle-axis rotation from a quaternion \a q.
* This function implicitly normalizes the quaternion \a q.
*/
template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; } template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
/** Constructs and initializes the angle-axis rotation from a 3x3 rotation matrix. */ /** Constructs and initializes the angle-axis rotation from a 3x3 rotation matrix. */
template<typename Derived> template<typename Derived>
@ -149,29 +151,27 @@ typedef AngleAxis<float> AngleAxisf;
typedef AngleAxis<double> AngleAxisd; typedef AngleAxis<double> AngleAxisd;
/** Set \c *this from a \b unit quaternion. /** Set \c *this from a \b unit quaternion.
* The axis is normalized. * The resulting axis is normalized.
* *
* \warning As any other method dealing with quaternion, if the input quaternion * This function implicitly normalizes the quaternion \a q.
* is not normalized then the result is undefined.
*/ */
template<typename Scalar> template<typename Scalar>
template<typename QuatDerived> template<typename QuatDerived>
AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q) AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
{ {
using std::acos; using std::atan2;
EIGEN_USING_STD_MATH(min); Scalar n = q.vec().norm();
EIGEN_USING_STD_MATH(max); if(n<NumTraits<Scalar>::epsilon())
using std::sqrt; n = q.vec().stableNorm();
Scalar n2 = q.vec().squaredNorm(); if (n > Scalar(0))
if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision())
{ {
m_angle = 0; m_angle = Scalar(2)*atan2(n, q.w());
m_axis << 1, 0, 0; m_axis = q.vec() / n;
} }
else else
{ {
m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1))); m_angle = 0;
m_axis = q.vec() / sqrt(n2); m_axis << 1, 0, 0;
} }
return *this; return *this;
} }

View File

@ -587,7 +587,7 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
// which yields a singular value problem // which yields a singular value problem
if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision()) if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision())
{ {
c = max<Scalar>(c,-1); c = (max)(c,Scalar(-1));
Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose(); Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV); JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
Vector3 axis = svd.matrixV().col(2); Vector3 axis = svd.matrixV().col(2);

View File

@ -194,9 +194,9 @@ public:
/** type of the matrix used to represent the linear part of the transformation */ /** type of the matrix used to represent the linear part of the transformation */
typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType; typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType;
/** type of read/write reference to the linear part of the transformation */ /** type of read/write reference to the linear part of the transformation */
typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact)> LinearPart; typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart;
/** type of read reference to the linear part of the transformation */ /** type of read reference to the linear part of the transformation */
typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact)> ConstLinearPart; typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart;
/** type of read/write reference to the affine part of the transformation */ /** type of read/write reference to the affine part of the transformation */
typedef typename internal::conditional<int(Mode)==int(AffineCompact), typedef typename internal::conditional<int(Mode)==int(AffineCompact),
MatrixType&, MatrixType&,

View File

@ -113,7 +113,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
const Index n = src.cols(); // number of measurements const Index n = src.cols(); // number of measurements
// required for demeaning ... // required for demeaning ...
const RealScalar one_over_n = 1 / static_cast<RealScalar>(n); const RealScalar one_over_n = RealScalar(1) / static_cast<RealScalar>(n);
// computation of mean // computation of mean
const VectorType src_mean = src.rowwise().sum() * one_over_n; const VectorType src_mean = src.rowwise().sum() * one_over_n;
@ -136,16 +136,16 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
// Eq. (39) // Eq. (39)
VectorType S = VectorType::Ones(m); VectorType S = VectorType::Ones(m);
if (sigma.determinant()<0) S(m-1) = -1; if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1);
// Eq. (40) and (43) // Eq. (40) and (43)
const VectorType& d = svd.singularValues(); const VectorType& d = svd.singularValues();
Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank; Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank;
if (rank == m-1) { if (rank == m-1) {
if ( svd.matrixU().determinant() * svd.matrixV().determinant() > 0 ) { if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) {
Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose(); Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose();
} else { } else {
const Scalar s = S(m-1); S(m-1) = -1; const Scalar s = S(m-1); S(m-1) = Scalar(-1);
Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
S(m-1) = s; S(m-1) = s;
} }
@ -156,7 +156,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
if (with_scaling) if (with_scaling)
{ {
// Eq. (42) // Eq. (42)
const Scalar c = 1/src_var * svd.singularValues().dot(S); const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S);
// Eq. (41) // Eq. (41)
Rt.col(m).head(m) = dst_mean; Rt.col(m).head(m) = dst_mean;

View File

@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec
typedef typename MatrixType::Index Index; typedef typename MatrixType::Index Index;
enum { TFactorSize = MatrixType::ColsAtCompileTime }; enum { TFactorSize = MatrixType::ColsAtCompileTime };
Index nbVecs = vectors.cols(); Index nbVecs = vectors.cols();
Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize> T(nbVecs,nbVecs); Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, ColMajor> T(nbVecs,nbVecs);
make_block_householder_triangular_factor(T, vectors, hCoeffs); make_block_householder_triangular_factor(T, vectors, hCoeffs);
const TriangularView<const VectorsType, UnitLower>& V(vectors); const TriangularView<const VectorsType, UnitLower>& V(vectors);

View File

@ -65,10 +65,10 @@ class DiagonalPreconditioner
{ {
typename MatType::InnerIterator it(mat,j); typename MatType::InnerIterator it(mat,j);
while(it && it.index()!=j) ++it; while(it && it.index()!=j) ++it;
if(it && it.index()==j) if(it && it.index()==j && it.value()!=Scalar(0))
m_invdiag(j) = Scalar(1)/it.value(); m_invdiag(j) = Scalar(1)/it.value();
else else
m_invdiag(j) = 0; m_invdiag(j) = Scalar(1);
} }
m_isInitialized = true; m_isInitialized = true;
return *this; return *this;

View File

@ -29,10 +29,11 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
* *
* \param MatrixType the type of the matrix of which we are computing the LU decomposition * \param MatrixType the type of the matrix of which we are computing the LU decomposition
* *
* This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
* is decomposed as A = PLUQ where L is unit-lower-triangular, U is upper-triangular, and P and Q * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
* are permutation matrices. This is a rank-revealing LU decomposition. The eigenvalues (diagonal * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU
* coefficients) of U are sorted in such a way that any zeros are at the end. * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any
* zeros are at the end.
* *
* This decomposition provides the generic approach to solving systems of linear equations, computing * This decomposition provides the generic approach to solving systems of linear equations, computing
* the rank, invertibility, inverse, kernel, and determinant. * the rank, invertibility, inverse, kernel, and determinant.
@ -546,8 +547,8 @@ typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant
} }
/** \returns the matrix represented by the decomposition, /** \returns the matrix represented by the decomposition,
* i.e., it returns the product: P^{-1} L U Q^{-1}. * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$.
* This function is provided for debug purpose. */ * This function is provided for debug purposes. */
template<typename MatrixType> template<typename MatrixType>
MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const
{ {
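
A debugging sketch built on the relation above (matrix contents are illustrative):

    #include <Eigen/Dense>

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(5,5);
    Eigen::FullPivLU<Eigen::MatrixXd> lu(A);
    // Reassembles P^{-1} L U Q^{-1}; should match A up to rounding:
    double err = (lu.reconstructedMatrix() - A).norm();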

View File

@ -109,7 +109,7 @@ class NaturalOrdering
* \class COLAMDOrdering * \class COLAMDOrdering
* *
* Functor computing the \em column \em approximate \em minimum \em degree ordering * Functor computing the \em column \em approximate \em minimum \em degree ordering
* The matrix should be in column-major format * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
*/ */
template<typename Index> template<typename Index>
class COLAMDOrdering class COLAMDOrdering
@ -118,10 +118,14 @@ class COLAMDOrdering
typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType; typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
typedef Matrix<Index, Dynamic, 1> IndexVector; typedef Matrix<Index, Dynamic, 1> IndexVector;
/** Compute the permutation vector from a sparse matrix */ /** Compute the permutation vector \a perm from the sparse matrix \a mat
* \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
*/
template <typename MatrixType> template <typename MatrixType>
void operator() (const MatrixType& mat, PermutationType& perm) void operator() (const MatrixType& mat, PermutationType& perm)
{ {
eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering");
Index m = mat.rows(); Index m = mat.rows();
Index n = mat.cols(); Index n = mat.cols();
Index nnz = mat.nonZeros(); Index nnz = mat.nonZeros();
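
A usage sketch honoring the new precondition (dimensions and fill are illustrative):

    #include <Eigen/Sparse>

    Eigen::SparseMatrix<double> A(100,100);
    // ... fill A, e.g. via setFromTriplets() ...
    A.makeCompressed();                                   // mandatory before ordering
    Eigen::COLAMDOrdering<int> ordering;
    Eigen::PermutationMatrix<Eigen::Dynamic,Eigen::Dynamic,int> perm;
    ordering(A, perm);                                    // fill-reducing permutation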

View File

@ -415,6 +415,7 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
JacobiRotation<RealScalar> *j_right) JacobiRotation<RealScalar> *j_right)
{ {
using std::sqrt; using std::sqrt;
using std::abs;
Matrix<RealScalar,2,2> m; Matrix<RealScalar,2,2> m;
m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)), m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q)); numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
@ -428,9 +429,11 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
} }
else else
{ {
RealScalar u = d / t; RealScalar t2d2 = numext::hypot(t,d);
rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u)); rot1.c() = abs(t)/t2d2;
rot1.s() = rot1.c() * u; rot1.s() = d/t2d2;
if(t<RealScalar(0))
rot1.s() = -rot1.s();
} }
m.applyOnTheLeft(0,1,rot1); m.applyOnTheLeft(0,1,rot1);
j_right->makeJacobi(m,0,1); j_right->makeJacobi(m,0,1);
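
The hypot-based form computes the same Givens rotation as the old u = d/t expression, but stays finite when |t| is tiny or |d| is huge: with r = hypot(t,d) = sqrt(t^2 + d^2),

    c = |t| / r,    s = sign(t) * d / r

which equals the old c = 1/sqrt(1+u^2), s = c*u whenever u = d/t is representable, yet never divides by t itself.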

View File

@ -338,7 +338,10 @@ const Block<const Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::inner
namespace internal { namespace internal {
template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel, template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
bool OuterVector = (BlockCols==1 && XprType::IsRowMajor) || (BlockRows==1 && !XprType::IsRowMajor)> bool OuterVector = (BlockCols==1 && XprType::IsRowMajor)
| // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
// revert to || as soon as not needed anymore.
(BlockRows==1 && !XprType::IsRowMajor)>
class GenericSparseBlockInnerIteratorImpl; class GenericSparseBlockInnerIteratorImpl;
} }

View File

@ -1205,7 +1205,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
m_data.value(p) = m_data.value(p-1); m_data.value(p) = m_data.value(p-1);
--p; --p;
} }
eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end"); eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end");
m_innerNonZeros[outer]++; m_innerNonZeros[outer]++;

View File

@ -58,6 +58,7 @@ namespace internal {
* \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module
* OrderingMethods \endlink module for the list of built-in and external ordering methods. * OrderingMethods \endlink module for the list of built-in and external ordering methods.
* *
* \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
* *
*/ */
template<typename _MatrixType, typename _OrderingType> template<typename _MatrixType, typename _OrderingType>
@ -77,10 +78,23 @@ class SparseQR
SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false) SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
{ } { }
/** Construct a QR factorization of the matrix \a mat.
*
* \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
*
* \sa compute()
*/
SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false) SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
{ {
compute(mat); compute(mat);
} }
/** Computes the QR factorization of the sparse matrix \a mat.
*
* \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
*
* \sa analyzePattern(), factorize()
*/
void compute(const MatrixType& mat) void compute(const MatrixType& mat)
{ {
analyzePattern(mat); analyzePattern(mat);
@ -255,6 +269,8 @@ class SparseQR
}; };
/** \brief Preprocessing step of a QR factorization /** \brief Preprocessing step of a QR factorization
*
* \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
* *
* In this step, the fill-reducing permutation is computed and applied to the columns of A * In this step, the fill-reducing permutation is computed and applied to the columns of A
* and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited. * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited.
@ -264,6 +280,7 @@ class SparseQR
template <typename MatrixType, typename OrderingType> template <typename MatrixType, typename OrderingType>
void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat) void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
{ {
eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR");
// Compute the column fill reducing ordering // Compute the column fill reducing ordering
OrderingType ord; OrderingType ord;
ord(mat, m_perm_c); ord(mat, m_perm_c);
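
End-to-end usage under the compressed-mode requirement (right-hand side and sizes are illustrative):

    #include <Eigen/Sparse>
    #include <Eigen/Dense>

    Eigen::SparseMatrix<double> A(100,100);
    Eigen::VectorXd b = Eigen::VectorXd::Ones(100);
    // ... fill A ...
    A.makeCompressed();                                   // required by SparseQR
    Eigen::SparseQR<Eigen::SparseMatrix<double>, Eigen::COLAMDOrdering<int> > qr(A);
    Eigen::VectorXd x = qr.solve(b);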

View File

@ -11,7 +11,7 @@
#ifndef EIGEN_STDDEQUE_H #ifndef EIGEN_STDDEQUE_H
#define EIGEN_STDDEQUE_H #define EIGEN_STDDEQUE_H
#include "Eigen/src/StlSupport/details.h" #include "details.h"
// Define the explicit instantiation (e.g. necessary for the Intel compiler) // Define the explicit instantiation (e.g. necessary for the Intel compiler)
#if defined(__INTEL_COMPILER) || defined(__GNUC__) #if defined(__INTEL_COMPILER) || defined(__GNUC__)

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_STDLIST_H #ifndef EIGEN_STDLIST_H
#define EIGEN_STDLIST_H #define EIGEN_STDLIST_H
#include "Eigen/src/StlSupport/details.h" #include "details.h"
// Define the explicit instantiation (e.g. necessary for the Intel compiler) // Define the explicit instantiation (e.g. necessary for the Intel compiler)
#if defined(__INTEL_COMPILER) || defined(__GNUC__) #if defined(__INTEL_COMPILER) || defined(__GNUC__)

View File

@ -11,7 +11,7 @@
#ifndef EIGEN_STDVECTOR_H #ifndef EIGEN_STDVECTOR_H
#define EIGEN_STDVECTOR_H #define EIGEN_STDVECTOR_H
#include "Eigen/src/StlSupport/details.h" #include "details.h"
/** /**
* This section contains a convenience MACRO which allows an easy specialization of * This section contains a convenience MACRO which allows an easy specialization of

View File

@ -141,6 +141,18 @@ tan() const
return derived(); return derived();
} }
/** \returns an expression of the coefficient-wise arc tan of *this.
*
* Example: \include Cwise_atan.cpp
* Output: \verbinclude Cwise_atan.out
*
* \sa cos(), sin(), tan()
*/
inline const CwiseUnaryOp<internal::scalar_atan_op<Scalar>, Derived>
atan() const
{
return derived();
}
/** \returns an expression of the coefficient-wise power of *this to the given exponent. /** \returns an expression of the coefficient-wise power of *this to the given exponent.
* *
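
A usage sketch for the new coefficient-wise atan() (values are illustrative):

    #include <Eigen/Dense>

    Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(5, -1.0, 1.0);
    Eigen::ArrayXd b = a.atan();        // applies std::atan to every coefficient
    // For a Matrix, route through the array interface: m.array().atan()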

README.md Normal file
View File

@ -0,0 +1,3 @@
**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
For more information go to http://eigen.tuxfamily.org/.

View File

@ -2,6 +2,14 @@
// g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
// icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out
// Compilation options:
//
// -DSCALAR=std::complex<double>
// -DSCALARA=double or -DSCALARB=double
// -DHAVE_BLAS
// -DDECOUPLED
//
#include <iostream> #include <iostream>
#include <Eigen/Core> #include <Eigen/Core>
#include <bench/BenchTimer.h> #include <bench/BenchTimer.h>
@ -14,10 +22,18 @@ using namespace Eigen;
#define SCALAR float #define SCALAR float
#endif #endif
#ifndef SCALARA
#define SCALARA SCALAR
#endif
#ifndef SCALARB
#define SCALARB SCALAR
#endif
typedef SCALAR Scalar; typedef SCALAR Scalar;
typedef NumTraits<Scalar>::Real RealScalar; typedef NumTraits<Scalar>::Real RealScalar;
typedef Matrix<RealScalar,Dynamic,Dynamic> A; typedef Matrix<SCALARA,Dynamic,Dynamic> A;
typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B; typedef Matrix<SCALARB,Dynamic,Dynamic> B;
typedef Matrix<Scalar,Dynamic,Dynamic> C; typedef Matrix<Scalar,Dynamic,Dynamic> C;
typedef Matrix<RealScalar,Dynamic,Dynamic> M; typedef Matrix<RealScalar,Dynamic,Dynamic> M;
@ -129,35 +145,62 @@ int main(int argc, char ** argv)
int tries = 2; // number of tries, we keep the best int tries = 2; // number of tries, we keep the best
int s = 2048; int s = 2048;
int m = s;
int n = s;
int p = s;
int cache_size = -1; int cache_size = -1;
bool need_help = false; bool need_help = false;
for (int i=1; i<argc; ++i) for (int i=1; i<argc;)
{ {
if(argv[i][0]=='s') if(argv[i][0]=='-')
s = atoi(argv[i]+1); {
else if(argv[i][0]=='c') if(argv[i][1]=='s')
cache_size = atoi(argv[i]+1); {
else if(argv[i][0]=='t') ++i;
tries = atoi(argv[i]+1); s = atoi(argv[i++]);
else if(argv[i][0]=='p') m = n = p = s;
rep = atoi(argv[i]+1); if(argv[i][0]!='-')
{
n = atoi(argv[i++]);
p = atoi(argv[i++]);
}
}
else if(argv[i][1]=='c')
{
++i;
cache_size = atoi(argv[i++]);
}
else if(argv[i][1]=='t')
{
++i;
tries = atoi(argv[i++]);
}
else if(argv[i][1]=='p')
{
++i;
rep = atoi(argv[i++]);
}
}
else else
{
need_help = true; need_help = true;
break;
}
} }
if(need_help) if(need_help)
{ {
std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n"; std::cout << argv[0] << " -s <matrix sizes> -c <cache size> -t <nb tries> -p <nb repeats>\n";
std::cout << " <matrix sizes> : size\n";
std::cout << " <matrix sizes> : rows columns depth\n";
return 1; return 1;
} }
  if(cache_size>0)
    setCpuCacheSizes(cache_size,96*cache_size);
-  int m = s;
-  int n = s;
-  int p = s;
  A a(m,p); a.setRandom();
  B b(p,n); b.setRandom();
  C c(m,n); c.setOnes();
@@ -172,6 +215,7 @@ int main(int argc, char ** argv)
  // check the parallel product is correct
  #if defined EIGEN_HAS_OPENMP
+  Eigen::initParallel();
  int procs = omp_get_max_threads();
  if(procs>1)
  {
@@ -188,11 +232,20 @@ int main(int argc, char ** argv)
  #elif defined HAVE_BLAS
  blas_gemm(a,b,r);
  c.noalias() += a * b;
-  if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
+  if(!r.isApprox(c)) {
+    std::cout << r - c << "\n";
+    std::cerr << "Warning, your product is crap!\n\n";
+  }
  #else
+  if(1.*m*n*p<2000.*2000*2000)
+  {
    gemm(a,b,c);
-    r.noalias() += a.cast<Scalar>() * b.cast<Scalar>();
+    r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
-    if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
+    if(!r.isApprox(c)) {
+      std::cout << r - c << "\n";
+      std::cerr << "Warning, your product is crap!\n\n";
+    }
+  }
  #endif
  #ifdef HAVE_BLAS
@@ -214,7 +267,7 @@ int main(int argc, char ** argv)
  {
    BenchTimer tmono;
    omp_set_num_threads(1);
-    Eigen::internal::setNbThreads(1);
+    Eigen::setNbThreads(1);
    c = rc;
    BENCH(tmono, tries, rep, gemm(a,b,c));
    std::cout << "eigen mono cpu    " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 <<  " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
@@ -223,6 +276,15 @@ int main(int argc, char ** argv)
  }
  #endif
+  if(1.*m*n*p<30*30*30)
+  {
+    BenchTimer tmt;
+    c = rc;
+    BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
+    std::cout << "lazy cpu         " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n";
+    std::cout << "lazy real        " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 <<  " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
+  }
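The guarded `lazy` timings exercise Eigen's coefficient-based product path, which skips the blocked GEMM kernel and its setup cost; the same path is used above to build the reference result. A minimal sketch of the distinction (illustrative sizes, assuming <Eigen/Core> is included and `using namespace Eigen`):

  MatrixXf a(8,8), b(8,8), c(8,8);
  a.setRandom(); b.setRandom(); c.setZero();
  c.noalias() += a * b;             // dispatches to the cache-blocked GEMM kernel
  c.noalias() += a.lazyProduct(b);  // evaluates coefficient-wise; cheaper for tiny matrices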
  #ifdef DECOUPLED
  if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
  {

View File

@@ -104,6 +104,7 @@ add_subdirectory(libs/mtl4)
add_subdirectory(libs/blitz)
add_subdirectory(libs/tvmet)
add_subdirectory(libs/STL)
+add_subdirectory(libs/blaze)
add_subdirectory(data)

View File

@@ -33,7 +33,7 @@ class Action_axpby {
public :
  // Ctor
-  Action_axpby( int size ):_size(size),_alpha(0.5),_beta(0.95)
+  Action_axpby( int size ):_alpha(0.5),_beta(0.95),_size(size)
  {
    MESSAGE("Action_axpby Ctor");

View File

@@ -35,7 +35,7 @@ public :
  // Ctor
-  Action_axpy( int size ):_size(size),_coef(1.0)
+  Action_axpy( int size ):_coef(1.0),_size(size)
  {
    MESSAGE("Action_axpy Ctor");

View File

@@ -4,10 +4,7 @@ if (ATLAS_LIBRARIES)
endif (ATLAS_LIBRARIES)
find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
@@ -22,14 +19,14 @@ find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
-  set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS} ${ATLAS_F77BLAS} ${ATLAS_LIB})
+  set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_LIB})
  # search the default lapack lib link to it
  find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64)
  find_library(ATLAS_REFERENCE_LAPACK NAMES lapack)
-  if(ATLAS_REFERENCE_LAPACK)
-    set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
-  endif()
+#  if(ATLAS_REFERENCE_LAPACK)
+#    set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
+#  endif()
endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)

View File

@@ -0,0 +1,31 @@
# - Try to find blaze headers
# Once done this will define
#
# BLAZE_FOUND - system has blaze lib
# BLAZE_INCLUDE_DIR - the blaze include directory
#
# Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
# Adapted from FindEigen.cmake:
# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
# Redistribution and use is allowed according to the terms of the BSD license.
# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
if (BLAZE_INCLUDE_DIR)
# in cache already
set(BLAZE_FOUND TRUE)
else (BLAZE_INCLUDE_DIR)
find_path(BLAZE_INCLUDE_DIR NAMES blaze/Blaze.h
PATHS
${INCLUDE_INSTALL_DIR}
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(BLAZE DEFAULT_MSG BLAZE_INCLUDE_DIR)
mark_as_advanced(BLAZE_INCLUDE_DIR)
endif(BLAZE_INCLUDE_DIR)

View File

@@ -1,15 +0,0 @@
if (GOTO_LIBRARIES)
set(GOTO_FIND_QUIETLY TRUE)
endif (GOTO_LIBRARIES)
find_library(GOTO_LIBRARIES goto PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR})
if(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread -lgfortran")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GOTO DEFAULT_MSG GOTO_LIBRARIES)
mark_as_advanced(GOTO_LIBRARIES)

View File

@@ -1,25 +0,0 @@
if (GOTO2_LIBRARIES)
set(GOTO2_FIND_QUIETLY TRUE)
endif (GOTO2_LIBRARIES)
#
# find_path(GOTO_INCLUDES
# NAMES
# cblas.h
# PATHS
# $ENV{GOTODIR}/include
# ${INCLUDE_INSTALL_DIR}
# )
find_file(GOTO2_LIBRARIES libgoto2.so PATHS /usr/lib $ENV{GOTO2DIR} ${LIB_INSTALL_DIR})
find_library(GOTO2_LIBRARIES goto2 PATHS $ENV{GOTO2DIR} ${LIB_INSTALL_DIR})
if(GOTO2_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
set(GOTO2_LIBRARIES ${GOTO2_LIBRARIES} "-lpthread -lgfortran")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GOTO2 DEFAULT_MSG
GOTO2_LIBRARIES)
mark_as_advanced(GOTO2_LIBRARIES)

View File

@@ -0,0 +1,17 @@
if (OPENBLAS_LIBRARIES)
set(OPENBLAS_FIND_QUIETLY TRUE)
endif (OPENBLAS_LIBRARIES)
find_file(OPENBLAS_LIBRARIES libopenblas.so PATHS /usr/lib $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
set(OPENBLAS_LIBRARIES ${OPENBLAS_LIBRARIES} "-lpthread -lgfortran")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(OPENBLAS DEFAULT_MSG
OPENBLAS_LIBRARIES)
mark_as_advanced(OPENBLAS_LIBRARIES)

View File

@@ -1,19 +1,19 @@
-aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:3000
+aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:5000
-ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:3000
+ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:5000
-atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:3000
+atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:5000
axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000
axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000
-matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:3000
+matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:5000
-matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000
+matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:5000
-trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:3000
+trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:5000
-trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:3000
+trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:5000
-trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:3000
+trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:5000
-cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000
+cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:5000
-complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000
+complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:5000
-partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000
+partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:5000
-tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000
+tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:5000
-hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000
+hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:5000
-symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:3000
+symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:5000
-syr2 ; "{/*1.5 symmetric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000
+syr2 ; "{/*1.5 symmetric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:5000
-ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:3000
+ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:5000
rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000

View File

@@ -10,7 +10,7 @@ ublas ; with lines lw 3 lt 1 lc rgbcolor "#00b7ff"
mtl4 ; with lines lw 3 lt 1 lc rgbcolor "#d18847"
blitz ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff"
F77 ; with lines lw 3 lt 3 lc rgbcolor "#e6e64c"
-GOTO ; with lines lw 3 lt 3 lc rgbcolor "#C05600"
-GOTO2 ; with lines lw 3 lt 1 lc rgbcolor "#C05600"
+OPENBLAS ; with lines lw 3 lt 1 lc rgbcolor "#C05600"
C ; with lines lw 3 lt 3 lc rgbcolor "#e6bd96"
ACML ; with lines lw 2 lt 3 lc rgbcolor "#e6e64c"
+blaze ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff"

View File

@@ -33,7 +33,7 @@
// min matrix size for matrix vector product bench
#define MIN_MV 5
// max matrix size for matrix vector product bench
-#define MAX_MV 3000
+#define MAX_MV 5000
// min matrix size for matrix matrix product bench
#define MIN_MM 5
// max matrix size for matrix matrix product bench

View File

@@ -176,7 +176,7 @@ public:
  if (_config!=NULL)
  {
    std::vector<BtlString> config = BtlString(_config).split(" \t\n");
-    for (int i = 0; i<config.size(); i++)
+    for (unsigned int i = 0; i<config.size(); i++)
    {
      if (config[i].beginsWith("-a"))
      {
@@ -224,7 +224,7 @@ public:
      return false;
    BtlString name(_name);
-    for (int i=0; i<Instance.m_selectedActionNames.size(); ++i)
+    for (unsigned int i=0; i<Instance.m_selectedActionNames.size(); ++i)
      if (name.contains(Instance.m_selectedActionNames[i]))
        return false;

View File

@@ -30,23 +30,23 @@ double simple_function(int index_i, int index_j)
  return index_i+index_j;
}
-double pseudo_random(int index)
+double pseudo_random(int /*index*/)
{
  return std::rand()/double(RAND_MAX);
}
-double pseudo_random(int index_i, int index_j)
+double pseudo_random(int /*index_i*/, int /*index_j*/)
{
  return std::rand()/double(RAND_MAX);
}
-double null_function(int index)
+double null_function(int /*index*/)
{
  return 0.0;
}
-double null_function(int index_i, int index_j)
+double null_function(int /*index_i*/, int /*index_j*/)
{
  return 0.0;
}

View File

@@ -23,7 +23,7 @@
#include "size_log.hh"
template<class Vector>
-void size_lin_log(const int nb_point, const int size_min, const int size_max, Vector & X)
+void size_lin_log(const int nb_point, const int /*size_min*/, const int size_max, Vector & X)
{
  int ten=10;
  int nine=9;

View File

@@ -18,27 +18,14 @@ if (MKL_FOUND)
endif (MKL_FOUND)
-find_package(GOTO2)
-if (GOTO2_FOUND)
-  btl_add_bench(btl_goto2 main.cpp)
-  if(BUILD_btl_goto2)
-    target_link_libraries(btl_goto2 ${GOTO_LIBRARIES} )
-    set_target_properties(btl_goto2 PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO2")
-  endif(BUILD_btl_goto2)
-endif (GOTO2_FOUND)
+find_package(OPENBLAS)
+if (OPENBLAS_FOUND)
+  btl_add_bench(btl_openblas main.cpp)
+  if(BUILD_btl_openblas)
+    target_link_libraries(btl_openblas ${OPENBLAS_LIBRARIES} )
+    set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS")
+  endif(BUILD_btl_openblas)
+endif (OPENBLAS_FOUND)
-find_package(GOTO)
-if (GOTO_FOUND)
-  if(GOTO2_FOUND)
-    btl_add_bench(btl_goto main.cpp OFF)
-  else()
-    btl_add_bench(btl_goto main.cpp)
-  endif()
-  if(BUILD_btl_goto)
-    target_link_libraries(btl_goto ${GOTO_LIBRARIES} )
-    set_target_properties(btl_goto PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO")
-  endif(BUILD_btl_goto)
-endif (GOTO_FOUND)
find_package(ACML)
if (ACML_FOUND)

View File

@@ -75,7 +75,6 @@ public :
  static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
    int N2 = N*N;
    BLAS_FUNC(copy)(&N2, X, &intone, C, &intone);
-    char uplo = 'L';
    int info = 0;
    int * ipiv = (int*)alloca(sizeof(int)*N);
    BLAS_FUNC(getrf)(&N, &N, C, &N, ipiv, &info);
@@ -92,7 +91,7 @@ public :
    BLAS_FUNC(trsm)(&right, &lower, &notrans, &nonunit, &N, &N, &fone, L, &N, X, &N);
  }
-  static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N){
+  static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & /*X*/, int N){
    BLAS_FUNC(trmm)(&left, &lower, &notrans,&nonunit, &N,&N,&fone,A,&N,B,&N);
  }
@@ -101,7 +100,6 @@ public :
  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
    int N2 = N*N;
    BLAS_FUNC(copy)(&N2, X, &intone, C, &intone);
-    char uplo = 'L';
    int info = 0;
    int * ipiv = (int*)alloca(sizeof(int)*N);
    int * jpiv = (int*)alloca(sizeof(int)*N);
@@ -134,8 +132,6 @@ public :
    }
    char uplo = 'U';
    int info = 0;
-    int ilo = 1;
-    int ihi = N;
    int bsize = 64;
    int worksize = N*bsize;
    SCALAR* d = new SCALAR[3*N+worksize];

View File

@@ -17,12 +17,12 @@ public:
  typedef real* gene_matrix;
  typedef real* gene_vector;
-  static void free_matrix(gene_matrix & A, int N){
+  static void free_matrix(gene_matrix & A, int /*N*/){
-    delete A;
+    delete[] A;
  }
  static void free_vector(gene_vector & B){
-    delete B;
+    delete[] B;
  }
  static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
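The fix pairs new[] with delete[]: freeing an array allocated with new[] through plain delete is undefined behavior. The rule in two lines (illustrative buffer):

  double* A = new double[64*64];  // array new
  delete[] A;                     // must be array delete, not `delete A;`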

View File

@@ -56,13 +56,13 @@ int main()
  bench<Action_trmm<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_cholesky<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_cholesky<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_partial_lu<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_partial_lu<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
  #ifdef HAS_LAPACK
-  bench<Action_lu_decomp<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_lu_decomp<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_hessenberg<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_hessenberg<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_tridiagonalization<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_tridiagonalization<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
  #endif
  //bench<Action_lu_solve<blas_LU_solve_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);

View File

@@ -44,9 +44,9 @@ public :
    return "STL";
  }
-  static void free_matrix(gene_matrix & A, int N){}
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/){}
-  static void free_vector(gene_vector & B){}
+  static void free_vector(gene_vector & /*B*/){}
  static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
    A = A_stl;

View File

@@ -0,0 +1,10 @@
find_package(BLAZE)
find_package(Boost)
if (BLAZE_FOUND AND Boost_FOUND)
include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
btl_add_bench(btl_blaze main.cpp)
if(BUILD_btl_blaze)
target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
endif()
endif ()

View File

@@ -0,0 +1,140 @@
//=====================================================
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
//=====================================================
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
#ifndef BLAZE_INTERFACE_HH
#define BLAZE_INTERFACE_HH
#include <blaze/Math.h>
#include <blaze/Blaze.h>
// using namespace blaze;
#include <vector>
template<class real>
class blaze_interface {
public :
typedef real real_type ;
typedef std::vector<real> stl_vector;
typedef std::vector<stl_vector > stl_matrix;
typedef blaze::DynamicMatrix<real,blaze::columnMajor> gene_matrix;
typedef blaze::DynamicVector<real> gene_vector;
static inline std::string name() { return "blaze"; }
static void free_matrix(gene_matrix & A, int N){
return ;
}
static void free_vector(gene_vector & B){
return ;
}
static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
A.resize(A_stl[0].size(), A_stl.size());
for (int j=0; j<A_stl.size() ; j++){
for (int i=0; i<A_stl[j].size() ; i++){
A(i,j) = A_stl[j][i];
}
}
}
static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){
B.resize(B_stl.size());
for (int i=0; i<B_stl.size() ; i++){
B[i] = B_stl[i];
}
}
static inline void vector_to_stl(gene_vector & B, stl_vector & B_stl){
for (int i=0; i<B_stl.size() ; i++){
B_stl[i] = B[i];
}
}
static inline void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
int N=A_stl.size();
for (int j=0;j<N;j++){
A_stl[j].resize(N);
for (int i=0;i<N;i++){
A_stl[j][i] = A(i,j);
}
}
}
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
X = (A*B);
}
static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
X = A.transpose()*B.transpose();
}
static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
X = (A.transpose()*A);
}
static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
X = (A*A.transpose());
}
static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
X = (A*B);
}
static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
X = (A.transpose()*B);
}
static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N){
Y += coef * X;
}
static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
Y = a*X + b*Y;
}
// static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
// C = X;
// recursive_cholesky(C);
// }
// static inline void lu_decomp(const gene_matrix & X, gene_matrix & R, int N){
// R = X;
// std::vector<int> ipvt(N);
// lu_factor(R, ipvt);
// }
// static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector & X, int N){
// X = lower_trisolve(L, B);
// }
static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
cible = source;
}
static inline void copy_vector(const gene_vector & source, gene_vector & cible, int N){
cible = source;
}
};
#endif

View File

@@ -0,0 +1,40 @@
//=====================================================
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
//=====================================================
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
#include "utilities.h"
#include "blaze_interface.hh"
#include "bench.hh"
#include "basic_actions.hh"
BTL_MAIN;
int main()
{
bench<Action_axpy<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
bench<Action_axpby<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
bench<Action_matrix_vector_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
// bench<Action_atv_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
// bench<Action_matrix_matrix_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
// bench<Action_ata_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
// bench<Action_aat_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
return 0;
}

View File

@@ -45,9 +45,9 @@ public :
    return EIGEN_MAKESTRING(BTL_PREFIX);
  }
-  static void free_matrix(gene_matrix & A, int N) {}
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}
-  static void free_vector(gene_vector & B) {}
+  static void free_vector(gene_vector & /*B*/) {}
  static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
    A.resize(A_stl[0].size(), A_stl.size());
@@ -84,28 +84,28 @@ public :
    }
  }
-  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
    X.noalias() = A*B;
  }
-  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
    X.noalias() = A.transpose()*B.transpose();
  }
-  // static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
+  // static inline void ata_product(const gene_matrix & A, gene_matrix & X, int /*N*/){
  //   X.noalias() = A.transpose()*A;
  // }
-  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
+  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int /*N*/){
    X.template triangularView<Lower>().setZero();
    X.template selfadjointView<Lower>().rankUpdate(A);
  }
-  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
+  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
    X.noalias() = A*B;
  }
-  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
+  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
    X.noalias() = (A.template selfadjointView<Lower>() * B);
    // internal::product_selfadjoint_vector<real,0,LowerTriangularBit,false,false>(N,A.data(),N, B.data(), 1, X.data(), 1);
  }
@@ -166,43 +166,43 @@ public :
    A.col(j) += X * Y[j];
  }
-  static EIGEN_DONT_INLINE void rot(gene_vector & A, gene_vector & B, real c, real s, int N){
+  static EIGEN_DONT_INLINE void rot(gene_vector & A, gene_vector & B, real c, real s, int /*N*/){
    internal::apply_rotation_in_the_plane(A, B, JacobiRotation<real>(c,s));
  }
-  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int /*N*/){
    X.noalias() = (A.transpose()*B);
  }
-  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){
+  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){
    Y += coef * X;
  }
-  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){
    Y = a*X + b*Y;
  }
-  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
+  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){
    cible = source;
  }
-  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int N){
+  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){
    cible = source;
  }
-  static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){
+  static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int /*N*/){
    X = L.template triangularView<Lower>().solve(B);
  }
-  static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
+  static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){
    X = L.template triangularView<Upper>().solve(B);
  }
-  static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
+  static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){
    X.noalias() = L.template triangularView<Lower>() * B;
  }
-  static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void cholesky(const gene_matrix & X, gene_matrix & C, int /*N*/){
    C = X;
    internal::llt_inplace<real,Lower>::blocked(C);
    //C = X.llt().matrixL();
@@ -211,7 +211,7 @@ public :
    // Cholesky<gene_matrix>::computeInPlaceBlock(C);
  }
-  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int /*N*/){
    C = X.fullPivLu().matrixLU();
  }
@@ -229,7 +229,7 @@ public :
    internal::tridiagonalization_inplace(C, aux);
  }
-  static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int /*N*/){
    C = HessenbergDecomposition<gene_matrix>(X).packedMatrix();
  }
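For context, aat_product builds A*A^T through SelfAdjointView::rankUpdate, which only touches the requested triangle. A minimal sketch of the call (illustrative sizes, assuming `using namespace Eigen`):

  MatrixXd A = MatrixXd::Random(5,3);
  MatrixXd X = MatrixXd::Zero(5,5);
  X.selfadjointView<Lower>().rankUpdate(A);    // lower triangle of X += A * A^T
  MatrixXd full = X.selfadjointView<Lower>();  // expand to the full symmetric matrix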

View File

@@ -29,14 +29,14 @@ BTL_MAIN;
int main()
{
-  bench<Action_trisolve<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_trisolve<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_trisolve_matrix<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_trisolve_matrix<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_cholesky<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_cholesky<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_lu_decomp<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_lu_decomp<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_partial_lu<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_partial_lu<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_hessenberg<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_hessenberg<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
-  bench<Action_tridiagonalization<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_tridiagonalization<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
  return 0;
}

bench/dense_solvers.cpp (new file, 76 lines)
View File

@@ -0,0 +1,76 @@
#include <iostream>
#include "BenchTimer.h"
#include <Eigen/Dense>
#include <map>
#include <string>
using namespace Eigen;
std::map<std::string,Array<float,1,4> > results;
template<typename Scalar,int Size>
void bench(int id, int size = Size)
{
typedef Matrix<Scalar,Size,Size> Mat;
Mat A(size,size);
A.setRandom();
A = A*A.adjoint();
BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_fpqr, t_jsvd;
int tries = 3;
int rep = 1000/size;
if(rep==0) rep = 1;
rep = rep*rep;
LLT<Mat> llt(A);
LDLT<Mat> ldlt(A);
PartialPivLU<Mat> lu(A);
FullPivLU<Mat> fplu(A);
HouseholderQR<Mat> qr(A);
ColPivHouseholderQR<Mat> cpqr(A);
FullPivHouseholderQR<Mat> fpqr(A);
JacobiSVD<Mat> jsvd(A.rows(),A.cols());
BENCH(t_llt, tries, rep, llt.compute(A));
BENCH(t_ldlt, tries, rep, ldlt.compute(A));
BENCH(t_lu, tries, rep, lu.compute(A));
BENCH(t_fplu, tries, rep, fplu.compute(A));
BENCH(t_qr, tries, rep, qr.compute(A));
BENCH(t_cpqr, tries, rep, cpqr.compute(A));
BENCH(t_fpqr, tries, rep, fpqr.compute(A));
if(size<500) // JacobiSVD is really too slow for too large matrices
BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV));
results["LLT"][id] = t_llt.best();
results["LDLT"][id] = t_ldlt.best();
results["PartialPivLU"][id] = t_lu.best();
results["FullPivLU"][id] = t_fplu.best();
results["HouseholderQR"][id] = t_qr.best();
results["ColPivHouseholderQR"][id] = t_cpqr.best();
results["FullPivHouseholderQR"][id] = t_fpqr.best();
results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0;
}
int main()
{
const int small = 8;
const int medium = 100;
const int large = 1000;
const int xl = 4000;
bench<float,small>(0);
bench<float,Dynamic>(1,medium);
bench<float,Dynamic>(2,large);
bench<float,Dynamic>(3,xl);
IOFormat fmt(3, 0, " \t", "\n", "", "");
std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n";
std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n";
std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
}

View File

@@ -91,4 +91,5 @@ void printBenchStyle(std::ofstream& out)
  </xsl:stylesheet>\n\n";
}
#endif

View File

@@ -1,9 +1,6 @@
This directory contains a BLAS library built on top of Eigen.
-This is currently a work in progress which is far to be ready for use,
-but feel free to contribute to it if you wish.
This module is not built by default. In order to compile it, you need to
type 'make blas' from within your build dir.

View File

@@ -106,13 +106,13 @@ matrix(T* data, int rows, int cols, int stride)
}
template<typename T>
-Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > vector(T* data, int size, int incr)
+Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(T* data, int size, int incr)
{
  return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
}
template<typename T>
-Map<Matrix<T,Dynamic,1> > vector(T* data, int size)
+Map<Matrix<T,Dynamic,1> > make_vector(T* data, int size)
{
  return Map<Matrix<T,Dynamic,1> >(data, size);
}
@@ -124,8 +124,8 @@ T* get_compact_vector(T* x, int n, int incx)
    return x;
  T* ret = new Scalar[n];
-  if(incx<0) vector(ret,n) = vector(x,n,-incx).reverse();
+  if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse();
-  else       vector(ret,n) = vector(x,n, incx);
+  else       make_vector(ret,n) = make_vector(x,n, incx);
  return ret;
}
@@ -135,8 +135,8 @@ T* copy_back(T* x_cpy, T* x, int n, int incx)
  if(x_cpy==x)
    return 0;
-  if(incx<0) vector(x,n,-incx).reverse() = vector(x_cpy,n);
+  if(incx<0) make_vector(x,n,-incx).reverse() = make_vector(x_cpy,n);
-  else       vector(x,n, incx) = vector(x_cpy,n);
+  else       make_vector(x,n, incx) = make_vector(x_cpy,n);
  return x_cpy;
}
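The rename from vector() to make_vector() avoids the perpetual ambiguity with std::vector in these translation units; the helper itself still just wraps a raw BLAS pointer in an Eigen Map, with or without a stride. A minimal sketch of what the two overloads build (hypothetical data, assuming `using namespace Eigen`):

  double data[6] = {1, 2, 3, 4, 5, 6};
  // contiguous view, as returned by make_vector(data, 3):
  Map<Matrix<double,Dynamic,1> > v1(data, 3);                  // 1, 2, 3
  // strided view, as returned by make_vector(data, 3, 2):
  Map<Matrix<double,Dynamic,1>, 0, InnerStride<Dynamic> > v2(data, 3, InnerStride<Dynamic>(2));  // 1, 3, 5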

View File

@@ -23,11 +23,10 @@ double BLASFUNC(dsdot)(int* n, float* x, int* incx, float* y, int* incy)
{
  if(*n<=0) return 0;
-  if(*incx==1 && *incy==1)    return (vector(x,*n).cast<double>().cwiseProduct(vector(y,*n).cast<double>())).sum();
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cast<double>().cwiseProduct(make_vector(y,*n).cast<double>())).sum();
-  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cast<double>().cwiseProduct(vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
-  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
-  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cast<double>().cwiseProduct(vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
-  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
  else return 0;
}
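dsdot returns the dot product of two single-precision vectors accumulated in double precision, which is exactly what the cast<double>() chain implements. A scalar reference sketch, assuming unit strides:

  double dsdot_ref(int n, const float* x, const float* y)
  {
    double acc = 0.0;
    for(int i = 0; i < n; ++i)
      acc += double(x[i]) * double(y[i]);  // accumulate in double to limit rounding error
    return acc;
  }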

View File

@@ -32,13 +32,14 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n,
  if(*n<=0) return 0;
-  if(*incx==1) return vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
+  if(*incx==1) return make_vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
-  else         return vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
+  else         return make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
}
// computes a dot product of a conjugated vector with another vector.
int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres)
{
-  // std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n";
  Scalar* res = reinterpret_cast<Scalar*>(pres);
  if(*n<=0)
@@ -50,11 +51,11 @@ int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, in
  Scalar* x = reinterpret_cast<Scalar*>(px);
  Scalar* y = reinterpret_cast<Scalar*>(py);
-  if(*incx==1 && *incy==1)    *res = (vector(x,*n).dot(vector(y,*n)));
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).dot(make_vector(y,*n)));
-  else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).dot(vector(y,*n,*incy)));
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,*incy)));
-  else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,*incy)));
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,*incy)));
-  else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).dot(vector(y,*n,-*incy).reverse()));
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,-*incy).reverse()));
-  else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,-*incy).reverse()));
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,-*incy).reverse()));
  return 0;
}
@@ -72,11 +73,11 @@ int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, in
  Scalar* x = reinterpret_cast<Scalar*>(px);
  Scalar* y = reinterpret_cast<Scalar*>(py);
-  if(*incx==1 && *incy==1)    *res = (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
-  else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
-  else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
-  else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
-  else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
  return 0;
}
@@ -88,9 +89,9 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n,
  Scalar* x = reinterpret_cast<Scalar*>(px);
  if(*incx==1)
-    return vector(x,*n).stableNorm();
+    return make_vector(x,*n).stableNorm();
-  return vector(x,*n,*incx).stableNorm();
+  return make_vector(x,*n,*incx).stableNorm();
}
int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
@@ -102,8 +103,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScal
  RealScalar c = *pc;
  RealScalar s = *ps;
-  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
-  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));
  Reverse<StridedVectorType> rvx(vx);
  Reverse<StridedVectorType> rvy(vy);
@@ -125,9 +126,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealSca
  // std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n";
-  if(*incx==1) vector(x,*n) *= alpha;
+  if(*incx==1) make_vector(x,*n) *= alpha;
-  else         vector(x,*n,std::abs(*incx)) *= alpha;
+  else         make_vector(x,*n,std::abs(*incx)) *= alpha;
  return 0;
}

View File

@@ -17,11 +17,11 @@ int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx,
  if(*n<=0) return 0;
-  if(*incx==1 && *incy==1)    vector(y,*n) += alpha * vector(x,*n);
+  if(*incx==1 && *incy==1)    make_vector(y,*n) += alpha * make_vector(x,*n);
-  else if(*incx>0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,*incx);
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,*incx);
-  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,*incx);
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,*incx);
-  else if(*incx<0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,-*incx).reverse();
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,-*incx).reverse();
-  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,-*incx).reverse();
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,-*incx).reverse();
  return 0;
}
@@ -35,7 +35,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int
  // be careful, *incx==0 is allowed !!
  if(*incx==1 && *incy==1)
-    vector(y,*n) = vector(x,*n);
+    make_vector(y,*n) = make_vector(x,*n);
  else
  {
    if(*incx<0) x = x - (*n-1)*(*incx);
@@ -57,8 +57,8 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *inc
  Scalar* x = reinterpret_cast<Scalar*>(px);
  DenseIndex ret;
-  if(*incx==1) vector(x,*n).cwiseAbs().maxCoeff(&ret);
+  if(*incx==1) make_vector(x,*n).cwiseAbs().maxCoeff(&ret);
-  else         vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
+  else         make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
  return ret+1;
}
@@ -68,8 +68,8 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amin_)(int *n, RealScalar *px, int *inc
  Scalar* x = reinterpret_cast<Scalar*>(px);
  DenseIndex ret;
-  if(*incx==1) vector(x,*n).cwiseAbs().minCoeff(&ret);
+  if(*incx==1) make_vector(x,*n).cwiseAbs().minCoeff(&ret);
-  else         vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
+  else         make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
  return ret+1;
}
@@ -143,8 +143,8 @@ int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx)
  Scalar* x = reinterpret_cast<Scalar*>(px);
  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  if(*incx==1) vector(x,*n) *= alpha;
+  if(*incx==1) make_vector(x,*n) *= alpha;
-  else         vector(x,*n,std::abs(*incx)) *= alpha;
+  else         make_vector(x,*n,std::abs(*incx)) *= alpha;
  return 0;
}
@@ -156,12 +156,11 @@ int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int
  Scalar* x = reinterpret_cast<Scalar*>(px);
  Scalar* y = reinterpret_cast<Scalar*>(py);
-  if(*incx==1 && *incy==1)    vector(y,*n).swap(vector(x,*n));
+  if(*incx==1 && *incy==1)    make_vector(y,*n).swap(make_vector(x,*n));
-  else if(*incx>0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,*incx));
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,*incx));
-  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,*incx));
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,*incx));
-  else if(*incx<0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,-*incx).reverse());
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,-*incx).reverse());
-  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,-*incx).reverse());
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,-*incx).reverse());
  return 1;
}

View File

@@ -19,8 +19,8 @@ RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx)
  if(*n<=0) return 0;
-  if(*incx==1) return vector(x,*n).cwiseAbs().sum();
+  if(*incx==1) return make_vector(x,*n).cwiseAbs().sum();
-  else         return vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
+  else         return make_vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
}
// computes a vector-vector dot product.
@@ -33,11 +33,11 @@ Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, i
  Scalar* x = reinterpret_cast<Scalar*>(px);
  Scalar* y = reinterpret_cast<Scalar*>(py);
-  if(*incx==1 && *incy==1)    return (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
-  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
-  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
-  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
-  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
  else return 0;
}
@@ -50,8 +50,8 @@ Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx)
  Scalar* x = reinterpret_cast<Scalar*>(px);
-  if(*incx==1) return vector(x,*n).stableNorm();
+  if(*incx==1) return make_vector(x,*n).stableNorm();
-  else         return vector(x,*n,std::abs(*incx)).stableNorm();
+  else         return make_vector(x,*n,std::abs(*incx)).stableNorm();
}
int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
@@ -64,8 +64,8 @@ int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int
  Scalar c = *reinterpret_cast<Scalar*>(pc);
  Scalar s = *reinterpret_cast<Scalar*>(ps);
-  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
-  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));
  Reverse<StridedVectorType> rvx(vx);
  Reverse<StridedVectorType> rvy(vy);
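rot applies a plane (Givens) rotation to the vector pair; apply_rotation_in_the_plane is Eigen's vectorized form of the textbook loop. A reference sketch of the operation, assuming unit strides:

  void rot_ref(int n, float* x, float* y, float c, float s)
  {
    for(int i = 0; i < n; ++i)
    {
      float xi = x[i], yi = y[i];
      x[i] =  c*xi + s*yi;
      y[i] = -s*xi + c*yi;
    }
  }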

View File

@@ -57,8 +57,8 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
  if(beta!=Scalar(1))
  {
-    if(beta==Scalar(0)) vector(actual_y, *n).setZero();
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
-    else                vector(actual_y, *n) *= beta;
+    else                make_vector(actual_y, *n) *= beta;
  }
  if(alpha!=Scalar(0))

View File

@@ -58,8 +58,8 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
  if(beta!=Scalar(1))
  {
-    if(beta==Scalar(0)) vector(actual_c, actual_m).setZero();
+    if(beta==Scalar(0)) make_vector(actual_c, actual_m).setZero();
-    else                vector(actual_c, actual_m) *= beta;
+    else                make_vector(actual_c, actual_m) *= beta;
  }
  if(code>=4 || func[code]==0)
@@ -232,8 +232,8 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
  if(beta!=Scalar(1))
  {
-    if(beta==Scalar(0)) vector(actual_y, actual_m).setZero();
+    if(beta==Scalar(0)) make_vector(actual_y, actual_m).setZero();
-    else                vector(actual_y, actual_m) *= beta;
+    else                make_vector(actual_y, actual_m) *= beta;
  }
  MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
@@ -246,11 +246,11 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
    int len = end - start + 1;
    int offset = (*ku) - j + start;
    if(OP(*trans)==NOTR)
-      vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
+      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
    else if(OP(*trans)==TR)
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
    else
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
  }
  if(actual_x!=x) delete[] actual_x;
@@ -304,11 +304,11 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea
    int offset = (ku) - j + start;
    if(OP(*trans)==NOTR)
-      vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
+      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
    else if(OP(*trans)==TR)
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
    else
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
  }
  if(actual_x!=x) delete[] actual_x;
@@ -521,4 +521,3 @@ int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
  return 1;
}
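The start/end/offset arithmetic in gbmv and tbmv follows the BLAS band-storage convention: column j of the band stores A(i,j) at row ku+i-j, for max(0,j-ku) <= i <= min(m-1,j+kl). A minimal access sketch (hypothetical helper, column-major storage with leading dimension lda >= kl+ku+1):

  double band_coeff(const double* ab, int lda, int ku, int i, int j)
  {
    return ab[(ku + i - j) + j*lda];  // valid only when j-ku <= i <= j+kl
  }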

View File

@@ -51,8 +51,8 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
  if(beta!=Scalar(1))
  {
-    if(beta==Scalar(0)) vector(actual_y, *n).setZero();
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
-    else                vector(actual_y, *n) *= beta;
+    else                make_vector(actual_y, *n) *= beta;
  }
  int code = UPLO(*uplo);
@@ -366,5 +366,3 @@ int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx,
  return 1;
}

View File

@@ -1,7 +1,7 @@
#include <iostream>
-#if (defined __GNUC__)
+#if (defined __GNUC__) && (!defined __MINGW32__)
#define EIGEN_WEAK_LINKING __attribute__ ((weak))
#else
#define EIGEN_WEAK_LINKING

View File

@@ -3,26 +3,26 @@
# Eigen3Config.cmake(.in)
# Use the following variables to compile and link against Eigen:
-#  EIGEN_FOUND - True if Eigen was found on your system
+#  EIGEN3_FOUND - True if Eigen was found on your system
-#  EIGEN_USE_FILE - The file making Eigen usable
+#  EIGEN3_USE_FILE - The file making Eigen usable
-#  EIGEN_DEFINITIONS - Definitions needed to build with Eigen
+#  EIGEN3_DEFINITIONS - Definitions needed to build with Eigen
-#  EIGEN_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found
+#  EIGEN3_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found
-#  EIGEN_INCLUDE_DIRS - List of directories of Eigen and its dependencies
+#  EIGEN3_INCLUDE_DIRS - List of directories of Eigen and its dependencies
-#  EIGEN_ROOT_DIR - The base directory of Eigen
+#  EIGEN3_ROOT_DIR - The base directory of Eigen
-#  EIGEN_VERSION_STRING - A human-readable string containing the version
+#  EIGEN3_VERSION_STRING - A human-readable string containing the version
-#  EIGEN_VERSION_MAJOR - The major version of Eigen
+#  EIGEN3_VERSION_MAJOR - The major version of Eigen
-#  EIGEN_VERSION_MINOR - The minor version of Eigen
+#  EIGEN3_VERSION_MINOR - The minor version of Eigen
-#  EIGEN_VERSION_PATCH - The patch version of Eigen
+#  EIGEN3_VERSION_PATCH - The patch version of Eigen
-set ( EIGEN_FOUND 1 )
+set ( EIGEN3_FOUND 1 )
-set ( EIGEN_USE_FILE "@EIGEN_USE_FILE@" )
+set ( EIGEN3_USE_FILE "@EIGEN_USE_FILE@" )
-set ( EIGEN_DEFINITIONS "@EIGEN_DEFINITIONS@" )
+set ( EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@" )
-set ( EIGEN_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" )
-set ( EIGEN_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" )
+set ( EIGEN3_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" )
-set ( EIGEN_ROOT_DIR "@EIGEN_ROOT_DIR@" )
+set ( EIGEN3_ROOT_DIR "@EIGEN_ROOT_DIR@" )
-set ( EIGEN_VERSION_STRING "@EIGEN_VERSION_STRING@" )
+set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" )
-set ( EIGEN_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" )
+set ( EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" )
-set ( EIGEN_VERSION_MINOR "@EIGEN_VERSION_MINOR@" )
+set ( EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@" )
-set ( EIGEN_VERSION_PATCH "@EIGEN_VERSION_PATCH@" )
+set ( EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@" )

Some files were not shown because too many files have changed in this diff.