commit b29b81a1f4
merge with default branch
@@ -196,6 +196,18 @@ if(NOT MSVC)
     message(STATUS "Enabling SSE4.2 in tests/examples")
   endif()
 
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
+  if(EIGEN_TEST_FMA)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+    message(STATUS "Enabling FMA in tests/examples")
+  endif()
+
   option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
   if(EIGEN_TEST_ALTIVEC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
@@ -437,12 +449,12 @@ set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} )
 set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
 
-configure_file ( ${CMAKE_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
                  ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
                  @ONLY ESCAPE_QUOTES
                )
 
-install ( FILES ${CMAKE_SOURCE_DIR}/cmake/UseEigen3.cmake
+install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
           ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
           DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
         )
@@ -10,9 +10,11 @@
  *
  *
  * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
- * Those decompositions are accessible via the following MatrixBase methods:
- *  - MatrixBase::llt(),
+ * Those decompositions are also accessible via the following methods:
+ *  - MatrixBase::llt()
  *  - MatrixBase::ldlt()
+ *  - SelfAdjointView::llt()
+ *  - SelfAdjointView::ldlt()
  *
  * \code
  * #include <Eigen/Cholesky>
Eigen/Core: 28 changed lines
@@ -117,7 +117,16 @@
 #ifdef __SSE4_2__
 #define EIGEN_VECTORIZE_SSE4_2
 #endif
+#ifdef __AVX__
+#define EIGEN_VECTORIZE_AVX
+#define EIGEN_VECTORIZE_SSE3
+#define EIGEN_VECTORIZE_SSSE3
+#define EIGEN_VECTORIZE_SSE4_1
+#define EIGEN_VECTORIZE_SSE4_2
+#endif
+#ifdef __FMA__
+#define EIGEN_VECTORIZE_FMA
+#endif
 
 // include files
 
 // This extern "C" works around a MINGW-w64 compilation issue
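Note: -mavx and -mfma (which the EIGEN_TEST_AVX/EIGEN_TEST_FMA options above pass to the compiler) make GCC, Clang, and ICC predefine __AVX__ and __FMA__, and AVX-capable targets support every SSE level, which is why the new block defines the whole EIGEN_VECTORIZE_SSE* chain at once. A minimal standalone probe of those predefined macros (assumed toolchain behavior, not Eigen code):

    // Probe which SIMD-related macros the current compiler flags predefine.
    #include <cstdio>

    int main() {
    #ifdef __AVX__
      std::puts("__AVX__ defined (e.g. -mavx): AVX implies SSE..SSE4.2");
    #endif
    #ifdef __FMA__
      std::puts("__FMA__ defined (e.g. -mfma)");
    #endif
    #ifdef __SSE4_2__
      std::puts("__SSE4_2__ defined");
    #endif
      return 0;
    }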
@@ -130,7 +139,7 @@
 extern "C" {
 // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
 // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-#ifdef __INTEL_COMPILER
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
 #include <immintrin.h>
 #else
 #include <emmintrin.h>
@@ -147,6 +156,9 @@
 #ifdef EIGEN_VECTORIZE_SSE4_2
 #include <nmmintrin.h>
 #endif
+#ifdef EIGEN_VECTORIZE_AVX
+#include <immintrin.h>
+#endif
 #endif
 } // end extern "C"
 #elif defined __ALTIVEC__
@@ -216,7 +228,9 @@
 namespace Eigen {
 
 inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_SSE4_2)
+#if defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_1)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
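SimdInstructionSetsInUse() is Eigen's public query for which of these branches was compiled in, so the effect of the new AVX case can be checked directly; a small usage sketch:

    // Prints the instruction sets Eigen was built to use; with -mavx this
    // now reports the AVX branch added above.
    #include <Eigen/Core>
    #include <iostream>

    int main() {
      std::cout << Eigen::SimdInstructionSetsInUse() << std::endl;
    }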
@@ -294,7 +308,13 @@ using std::ptrdiff_t;
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_AVX
+  // Use AVX for floats and doubles, SSE for integers
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/Complex.h"
+#elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/SSE/Complex.h"
@@ -47,7 +47,9 @@
 #include "src/Geometry/AlignedBox.h"
 #include "src/Geometry/Umeyama.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
   #include "src/Geometry/arch/Geometry_SSE.h"
 #endif
 #endif
Eigen/LU: 4 changed lines
@@ -27,7 +27,9 @@
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
   #include "src/LU/arch/Inverse_SSE.h"
 #endif
 
@@ -43,7 +43,7 @@ namespace internal {
   * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
   * decomposition to determine whether a system of equations has a solution.
   *
-  * \sa MatrixBase::ldlt(), class LLT
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
 {
@@ -179,7 +179,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
   * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
   * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
   *
-  * \sa MatrixBase::ldlt()
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
   */
 #ifdef EIGEN_TEST_EVALUATORS
     template<typename Rhs>
@@ -610,6 +610,7 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
 #ifndef __CUDACC__
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa MatrixBase::ldlt()
   */
 template<typename MatrixType, unsigned int UpLo>
 inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -620,6 +621,7 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const
 
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa SelfAdjointView::ldlt()
   */
 template<typename Derived>
 inline const LDLT<typename MatrixBase<Derived>::PlainObject>
@@ -41,7 +41,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * Example: \include LLT_example.cpp
   * Output: \verbinclude LLT_example.out
   *
-  * \sa MatrixBase::llt(), class LDLT
+  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
 /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
   * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
@@ -115,7 +115,7 @@ template<typename _MatrixType, int _UpLo> class LLT
       * Example: \include LLT_solve.cpp
       * Output: \verbinclude LLT_solve.out
       *
-      * \sa solveInPlace(), MatrixBase::llt()
+      * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
       */
 #ifdef EIGEN_TEST_EVALUATORS
     template<typename Rhs>
@@ -498,6 +498,7 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
 #ifndef __CUDACC__
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
   */
 template<typename Derived>
 inline const LLT<typename MatrixBase<Derived>::PlainObject>
@@ -508,6 +509,7 @@ MatrixBase<Derived>::llt() const
 
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
   */
 template<typename MatrixType, unsigned int UpLo>
 inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
|
|||||||
EIGEN_DEBUG_VAR(DstIsAligned)
|
EIGEN_DEBUG_VAR(DstIsAligned)
|
||||||
EIGEN_DEBUG_VAR(SrcIsAligned)
|
EIGEN_DEBUG_VAR(SrcIsAligned)
|
||||||
EIGEN_DEBUG_VAR(JointAlignment)
|
EIGEN_DEBUG_VAR(JointAlignment)
|
||||||
|
EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime)
|
||||||
|
EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost)
|
||||||
EIGEN_DEBUG_VAR(InnerSize)
|
EIGEN_DEBUG_VAR(InnerSize)
|
||||||
EIGEN_DEBUG_VAR(InnerMaxSize)
|
EIGEN_DEBUG_VAR(InnerMaxSize)
|
||||||
EIGEN_DEBUG_VAR(PacketSize)
|
EIGEN_DEBUG_VAR(PacketSize)
|
||||||
|
@@ -202,6 +202,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan)
 //EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln)
@@ -88,8 +88,8 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
     MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
                        && (InnerStrideAtCompileTime == 1)
                         ? PacketAccessBit : 0,
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
     FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
     Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
@@ -40,7 +40,7 @@ void check_static_allocation_size()
   */
 template <typename T, int Size, int MatrixOrArrayOptions,
           int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%16)==0) ? 16
+                        : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
                         : 0 >
 struct plain_array
 {
@@ -68,27 +68,27 @@ struct plain_array
 template<typename PtrType>
 EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
 #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-  eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
+  eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #else
 #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-  eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
+  eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #endif
 
 template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
 {
-  EIGEN_USER_ALIGN16 T array[Size];
+  EIGEN_USER_ALIGN_DEFAULT T array[Size];
 
   EIGEN_DEVICE_FUNC
   plain_array()
   {
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
     check_static_allocation_size<T,Size>();
   }
 
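The rewritten assert generalizes the old hard-coded 0xf mask: a pointer is N-byte aligned, for N a power of two, exactly when addr & (N-1) == 0, so passing EIGEN_ALIGN_BYTES-1 as sizemask keeps the same test working when the default alignment grows to 32 for AVX. A standalone sketch of that arithmetic (the helper name is illustrative, not Eigen's):

    #include <cassert>
    #include <cstdint>

    // True when p is n-byte aligned; n must be a power of two.
    static bool is_aligned(const void* p, std::size_t n) {
      return (reinterpret_cast<std::uintptr_t>(p) & (n - 1)) == 0;
    }

    int main() {
      alignas(32) float buf[8];
      assert(is_aligned(buf, 32));  // 32 == EIGEN_ALIGN_BYTES under AVX
    }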
@@ -102,7 +102,7 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 16>
 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
-  EIGEN_USER_ALIGN16 T array[1];
+  EIGEN_USER_ALIGN_DEFAULT T array[1];
   EIGEN_DEVICE_FUNC plain_array() {}
   EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
@@ -69,8 +69,7 @@ template<typename Lhs, typename Rhs> struct product_type
   MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
                                          _Rhs::MaxRowsAtCompileTime),
   Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
-                                      _Rhs::RowsAtCompileTime),
-  LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                                      _Rhs::RowsAtCompileTime)
 };
 
 // the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -424,7 +423,7 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
   internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
            : m_data.array;
   }
 #endif
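data() above rounds the scratch buffer up past the next alignment boundary with (addr & ~(A-1)) + A, which always advances by at least one byte and at most A, so over-allocating by PacketSize scalars is enough headroom. An illustrative round-up helper under that assumption (hypothetical name, not Eigen API):

    #include <cstdint>

    // Returns the first A-byte boundary strictly above p; A is a power of two.
    template <std::uintptr_t A>
    void* align_up_past(void* p) {
      std::uintptr_t u = reinterpret_cast<std::uintptr_t>(p);
      return reinterpret_cast<void*>((u & ~(A - 1)) + A);
    }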
Eigen/src/Core/GenericPacketMath.h: 106 changed lines (mode changed: normal file → executable file)
@@ -42,6 +42,8 @@ namespace internal {
 struct default_packet_traits
 {
   enum {
+    HasHalfPacket = 0,
+
     HasAdd = 1,
     HasSub = 1,
     HasMul = 1,
@@ -71,10 +73,12 @@ struct default_packet_traits
 template<typename T> struct packet_traits : default_packet_traits
 {
   typedef T type;
+  typedef T half;
   enum {
     Vectorizable = 0,
     size = 1,
-    AlignedOnScalar = 0
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
   };
   enum {
     HasAdd = 0,
@@ -157,18 +161,66 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
-/** \internal \returns a packet with elements of \a *from duplicated.
- * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
- * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
- * Currently, this function is only used for scalar * complex products.
- */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
-
 /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
 
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
+
+/** \internal \returns a packet with elements of \a *from duplicated.
+ * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
+ * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
+ * Currently, this function is only used for scalar * complex products.
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet with elements of \a *from quadrupled.
+ * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
+ * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
+ * Currently, this function is only used in matrix products.
+ * For packet-size smaller or equal to 4, this function is equivalent to pload1
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadquad(const typename unpacket_traits<Packet>::type* from)
+{ return pload1<Packet>(from); }
+
+/** \internal equivalent to
+ * \code
+ * a0 = pload1(a+0);
+ * a1 = pload1(a+1);
+ * a2 = pload1(a+2);
+ * a3 = pload1(a+3);
+ * \endcode
+ * \sa pset1, pload1, ploaddup, pbroadcast2
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+  a2 = pload1<Packet>(a+2);
+  a3 = pload1<Packet>(a+3);
+}
+
+/** \internal equivalent to
+ * \code
+ * a0 = pload1(a+0);
+ * a1 = pload1(a+1);
+ * \endcode
+ * \sa pset1, pload1, ploaddup, pbroadcast4
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+}
+
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
 template<typename Scalar> inline typename packet_traits<Scalar>::type
 plset(const Scalar& a) { return a; }
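The new load variants differ only in their replication pattern across lanes. A scalar model of the three patterns, assuming a hypothetical 8-lane float packet stored as a plain array (not Eigen's real packet types):

    #include <array>
    using Packet8 = std::array<float, 8>;

    Packet8 load1_model(const float* a) {     // pload1: {a0,a0,a0,a0,a0,a0,a0,a0}
      Packet8 p; p.fill(a[0]); return p;
    }
    Packet8 loaddup_model(const float* f) {   // ploaddup: {f0,f0,f1,f1,f2,f2,f3,f3}
      Packet8 p;
      for (int i = 0; i < 8; ++i) p[i] = f[i / 2];
      return p;
    }
    Packet8 loadquad_model(const float* f) {  // ploadquad: {f0,f0,f0,f0,f1,f1,f1,f1}
      Packet8 p;
      for (int i = 0; i < 8; ++i) p[i] = f[i / 4];
      return p;
    }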
@@ -179,7 +231,13 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
 
 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 { (*to) = from; }
 
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, int /*stride*/)
+{ return ploadu<Packet>(from); }
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/)
+{ pstore(to, from); }
+
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> inline void prefetch(const Scalar* addr)
@@ -201,6 +259,15 @@ preduxp(const Packet* vecs) { return vecs[0]; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
 { return a; }
 
+/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller or equal to 4, this boils down to a noop.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux4(const Packet& a)
+{ return a; }
+
 /** \internal \returns the product of the elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }
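predux4 reduces across the two 4-element halves of a packet and returns a half packet; in scalar terms, for the same hypothetical 8-lane model as above:

    #include <array>

    // predux4 model: {a0..a7} -> {a0+a4, a1+a5, a2+a6, a3+a7};
    // for packets of size <= 4 the real function is a no-op.
    std::array<float, 4> predux4_model(const std::array<float, 8>& a) {
      std::array<float, 4> h;
      for (int i = 0; i < 4; ++i) h[i] = a[i] + a[i + 4];
      return h;
    }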
@@ -251,6 +318,10 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }
 
+/** \internal \returns the atan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { using std::atan; return atan(a); }
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -348,9 +419,22 @@ template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
 
 #endif
 
+
+/***************************************************************************
+ * PacketBlock, that is a collection of N packets where the number of words
+ * in the packet is a multiple of N.
+***************************************************************************/
+template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
+  Packet packet[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
 
 #endif // EIGEN_GENERIC_PACKET_MATH_H
 
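PacketBlock bundles N packets so that architecture kernels can transpose them entirely in registers; with an N-lane packet per row the block is an NxN matrix. A scalar model of what a ptranspose specialization computes (illustrative types, not the SIMD implementation):

    #include <array>
    #include <utility>

    template <typename T, int N>
    struct PacketBlockModel { std::array<std::array<T, N>, N> packet; };

    // Swap element (i,j) with (j,i): rows become columns, which the AVX
    // kernels later in this commit do with shuffles instead of scalar swaps.
    template <typename T, int N>
    void ptranspose_model(PacketBlockModel<T, N>& k) {
      for (int i = 0; i < N; ++i)
        for (int j = i + 1; j < N; ++j)
          std::swap(k.packet[i][j], k.packet[j][i]);
    }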
@@ -45,6 +45,7 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
@@ -89,7 +89,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
                 && ( bool(IsDynamicSize)
                    || HasNoOuterStride
                    || ( OuterStrideAtCompileTime!=Dynamic
-                     && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+                     && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
     Flags0 = TraitsBase::Flags & (~NestByRefBit),
     Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
     Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
|
|||||||
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
|
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
|
||||||
internal::inner_stride_at_compile_time<Derived>::ret==1),
|
internal::inner_stride_at_compile_time<Derived>::ret==1),
|
||||||
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
|
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
|
||||||
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0) && "data is not aligned");
|
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
|
||||||
#else
|
#else
|
||||||
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % 16) == 0) && "data is not aligned");
|
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PointerType m_data;
|
PointerType m_data;
|
||||||
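User-visible consequence of the assert change: buffers handed to an aligned Map must now meet EIGEN_ALIGN_BYTES (32 in AVX builds) rather than 16. A hedged usage sketch:

    #include <Eigen/Core>

    int main() {
      alignas(32) float data[8] = {0};  // 32-byte aligned for AVX builds
      Eigen::Map<Eigen::Matrix<float, 8, 1>, Eigen::Aligned> v(data);
      v.setConstant(1.f);
    }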
@@ -255,6 +254,8 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
     using Base::Base::operator=;
 };
 
+#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
+
 } // end namespace Eigen
 
 #endif // EIGEN_MAPBASE_H
@@ -669,6 +669,15 @@ bool (isfinite)(const T& x)
   return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
 }
 
+template<typename T>
+EIGEN_DEVICE_FUNC
+bool (isfinite)(const std::complex<T>& x)
+{
+  using std::real;
+  using std::imag;
+  return isfinite(real(x)) && isfinite(imag(x));
+}
+
 } // end namespace numext
 
 namespace internal {
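The new overload just requires both components to be finite; the same check expressed with the standard library, for comparison (numext::isfinite itself is Eigen-internal):

    #include <cmath>
    #include <complex>

    template <typename T>
    bool complex_isfinite(const std::complex<T>& x) {
      return std::isfinite(x.real()) && std::isfinite(x.imag());
    }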
@@ -325,10 +325,16 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
   };
   static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
-    Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
-    if (VectorizedSize != Size)
-      res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
-    return res;
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    if (VectorizedSize > 0) {
+      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      if (VectorizedSize != Size)
+        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+      return res;
+    }
+    else {
+      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+    }
   }
 };
 
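The guard matters once packets get wider: with AVX, PacketSize is 8 for floats, so a fixed-size reduction smaller than one packet must take the scalar-only branch. A worked instance of the arithmetic, assuming VectorizedSize is computed as (Size/PacketSize)*PacketSize, which is what the unrollers rely on:

    // A 3-element float reduction with 8-lane AVX packets: no full packet fits.
    constexpr int Size = 3, PacketSize = 8;
    constexpr int VectorizedSize = (Size / PacketSize) * PacketSize;  // == 0
    static_assert(VectorizedSize == 0, "scalar-only branch is taken");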
@@ -97,7 +97,7 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
   template<typename Derived> struct match {
     enum {
       HasDirectAccess = internal::has_direct_access<Derived>::ret,
-      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
       InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
                       || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
                       || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
@@ -168,8 +168,12 @@ protected:
     }
     else
       ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
-    ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                 StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());
+
+    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
+      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
+    else
+      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
+                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());
   }
 
   StrideBase m_stride;
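With Derived::IsVectorAtCompileTime added to StorageOrderMatch, a vector expression may now bind to a Ref of the opposite orientation, and the new stride branch swaps inner/outer strides accordingly. A plausible use this enables (a matrix row, i.e. a strided row-vector, viewed through a column-vector Ref; sketch, not taken from the commit):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix4f A = Eigen::Matrix4f::Random();
      // A.row(1) is a row-vector with non-unit inner stride; mapping it as a
      // column vector requires a run-time inner stride.
      Eigen::Ref<const Eigen::VectorXf, 0, Eigen::InnerStride<> > r(A.row(1));
      std::cout << r.transpose() << std::endl;
    }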
Eigen/src/Core/arch/AVX/CMakeLists.txt: new file, 6 lines
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_Core_arch_AVX_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
+)
Eigen/src/Core/arch/AVX/Complex.h: new file, 463 lines
@@ -0,0 +1,463 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX_H
+#define EIGEN_COMPLEX_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet4cf
+{
+  EIGEN_STRONG_INLINE Packet4cf() {}
+  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
+  __m256 v;
+};
+
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+  typedef Packet4cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
+
+template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
+{
+  return Packet4cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
+{
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
+  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+
+template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
+{
+  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
+{
+  // FIXME The following might be optimized using _mm256_movedup_pd
+  Packet2cf a = ploaddup<Packet2cf>(from);
+  Packet2cf b = ploaddup<Packet2cf>(from+1);
+  return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride)
+{
+  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
+                                 std::imag(from[2*stride]), std::real(from[2*stride]),
+                                 std::imag(from[1*stride]), std::real(from[1*stride]),
+                                 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride)
+{
+  __m128 low = _mm256_extractf128_ps(from.v, 0);
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+
+  __m128 high = _mm256_extractf128_ps(from.v, 1);
+  to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+  to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
+
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
+{
+  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+  __m128 low  = _mm256_extractf128_ps(a.v, 0);
+  __m128 high = _mm256_extractf128_ps(a.v, 1);
+  __m128d lowd  = _mm_castps_pd(low);
+  __m128d highd = _mm_castps_pd(high);
+  low  = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
+  high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
+  __m256 result = _mm256_setzero_ps();
+  result = _mm256_insertf128_ps(result, low, 1);
+  result = _mm256_insertf128_ps(result, high, 0);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
+{
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
+                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
+{
+  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t0 = _mm256_hadd_ps(t0,t1);
+  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t2 = _mm256_hadd_ps(t2,t3);
+
+  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
+  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
+
+  return Packet4cf(_mm256_add_ps(t1,t3));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
+{
+  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
+                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> struct conj_helper<Packet8f, Packet4cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
+  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet4cf, Packet8f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
+  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  Packet4cf num = pmul(a, pconj(b));
+  __m256 tmp = _mm256_mul_ps(b.v, b.v);
+  __m256 tmp2  = _mm256_shuffle_ps(tmp,tmp,0xB1);
+  __m256 denom = _mm256_add_ps(tmp, tmp2);
+  return Packet4cf(_mm256_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
+{
+  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
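pmul above computes (are*bre - aim*bim, are*bim + aim*bre) for four complex floats at once: _mm256_moveldup_ps/_mm256_movehdup_ps broadcast the real resp. imaginary part of a across each lane pair, the permute swaps b's re/im, and _mm256_addsub_ps subtracts in even lanes and adds in odd lanes. A scalar model of that lane arithmetic, assuming the interleaved {re,im} layout:

    #include <array>

    // Lanes hold {re0,im0,re1,im1,...}; mirrors pmul<Packet4cf> above.
    std::array<float, 8> cplx_mul_model(const std::array<float, 8>& a,
                                        const std::array<float, 8>& b) {
      std::array<float, 8> r;
      for (int i = 0; i < 8; i += 2) {
        r[i]     = a[i] * b[i]     - a[i + 1] * b[i + 1];  // addsub even: subtract
        r[i + 1] = a[i] * b[i + 1] + a[i + 1] * b[i];      // addsub odd: add
      }
      return r;
    }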
|
//---------- double ----------
|
||||||
|
struct Packet2cd
|
||||||
|
{
|
||||||
|
EIGEN_STRONG_INLINE Packet2cd() {}
|
||||||
|
EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
|
||||||
|
__m256d v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||||
|
{
|
||||||
|
typedef Packet2cd type;
|
||||||
|
typedef Packet1cd half;
|
||||||
|
enum {
|
||||||
|
Vectorizable = 1,
|
||||||
|
AlignedOnScalar = 0,
|
||||||
|
size = 2,
|
||||||
|
HasHalfPacket = 1,
|
||||||
|
|
||||||
|
HasAdd = 1,
|
||||||
|
HasSub = 1,
|
||||||
|
HasMul = 1,
|
||||||
|
HasDiv = 1,
|
||||||
|
HasNegate = 1,
|
||||||
|
HasAbs = 0,
|
||||||
|
HasAbs2 = 0,
|
||||||
|
HasMin = 0,
|
||||||
|
HasMax = 0,
|
||||||
|
HasSetLinear = 0
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
|
||||||
|
{
|
||||||
|
const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
|
||||||
|
return Packet2cd(_mm256_xor_pd(a.v,mask));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
|
||||||
|
{
|
||||||
|
__m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
|
||||||
|
__m256d even = _mm256_mul_pd(tmp1, b.v);
|
||||||
|
__m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
|
||||||
|
__m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
|
||||||
|
__m256d odd = _mm256_mul_pd(tmp2, tmp3);
|
||||||
|
return Packet2cd(_mm256_addsub_pd(even, odd));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
|
||||||
|
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
|
||||||
|
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
|
||||||
|
{
|
||||||
|
// in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
|
||||||
|
// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
|
||||||
|
return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
|
||||||
|
}

template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }

template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }

template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, int stride)
{
  return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
                                 std::imag(from[0*stride]), std::real(from[0*stride])));
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, int stride)
{
  __m128d low = _mm256_extractf128_pd(from.v, 0);
  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
  __m128d high = _mm256_extractf128_pd(from.v, 1);
  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
}

template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
{
  __m128d low = _mm256_extractf128_pd(a.v, 0);
  EIGEN_ALIGN16 double res[2];
  _mm_store_pd(res, low);
  return std::complex<double>(res[0],res[1]);
}

template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
  return Packet2cd(result);
}

template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
{
  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
}

template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
{
  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));

  return Packet2cd(_mm256_add_pd(t0,t1));
}

template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
{
  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
}

template<int Offset>
struct palign_impl<Offset,Packet2cd>
{
  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
  {
    if (Offset==0) return;
    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
  }
};

template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return internal::pmul(a, pconj(b));
  }
};

template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return internal::pmul(pconj(a), b);
  }
};

template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return pconj(internal::pmul(a, b));
  }
};

template<> struct conj_helper<Packet4d, Packet2cd, false,false>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};

template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};

template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  Packet2cd num = pmul(a, pconj(b));
  __m256d tmp = _mm256_mul_pd(b.v, b.v);
  __m256d denom = _mm256_hadd_pd(tmp, tmp);
  return Packet2cd(_mm256_div_pd(num.v, denom));
}
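// The above evaluates a/b as (a*conj(b)) / |b|^2: pmul(a,pconj(b)) forms the
// numerator, and _mm256_hadd_pd(b*b, b*b) places re(b)^2+im(b)^2 in both the
// real and imaginary slots of each complex lane before the final division.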

template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
{
  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
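// pcplxflip swaps the real and imaginary parts of each complex number:
// {re0,im0, re1,im1} becomes {im0,re0, im1,re1} (shuffle mask 0x5 swaps the
// two doubles within each 128-bit lane).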

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4cf,4>& kernel) {
  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);

  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);

  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
}
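// Reinterpreting the float packets as __m256d lets each std::complex<float>
// (one 64-bit re/im pair) travel as a single double lane, so this 4x4 complex
// transpose reduces to the same shuffle/permute pattern used for Packet4d.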

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cd,2>& kernel) {
  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
  kernel.packet[0].v = tmp;
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_COMPLEX_AVX_H
564
Eigen/src/Core/arch/AVX/PacketMath.h
Normal file
@ -0,0 +1,564 @@

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif
#endif

typedef __m256  Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;

template<> struct is_arithmetic<__m256>  { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };

#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
  const Packet8f p8f_##NAME = pset1<Packet8f>(X)

#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
  const Packet4d p4d_##NAME = pset1<Packet4d>(X)

template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet8f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8,
    HasHalfPacket = 1,

    HasDiv  = 1,
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 0,
    HasSqrt = 0
  };
};
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet4d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket = 1,

    HasDiv = 1,
    HasExp = 0
  };
};

/* Proper support for integers is only provided by AVX2. In the meantime, we'll
   use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet8i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8
  };
};
*/

template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8}; };

template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }

template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }

template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }

template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}

template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by AVX");
  return pset1<Packet8i>(0);
}

#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if defined(__clang__) || defined(__GNUC__)
  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
  // and gcc stupidly generates a vfmadd132ps instruction,
  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
  // the result of the product.
  Packet8f res = c;
  asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if defined(__clang__) || defined(__GNUC__)
  // see above
  Packet4d res = c;
  asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
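// When EIGEN_VECTORIZE_FMA is not defined, the generic pmadd falls back to
// padd(pmul(a,b),c); the fused versions above compute a*b+c with a single
// rounding step, which is both faster and slightly more accurate.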

template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }

template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }

// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register
  // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
  // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
  // return _mm256_unpacklo_ps(tmp,tmp);

  // _mm256_insertf128_ps is very slow on Haswell, thus:
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "in-place" permutation of the lower 128 bits using a blend
  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // then we can perform a consistent permutation on the global register to get everything in shape:
  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
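// For example, with from = {1,2,3,4} the three steps above produce:
//   broadcast: {1,2,3,4, 1,2,3,4}
//   blend:     {1,2,1,2, 1,2,3,4}
//   permute:   {1,1,2,2, 3,3,4,4}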
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  return _mm256_permute_pd(tmp, 3<<2);
}

// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}

template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }

template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }

// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
{
  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride)
{
  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
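// Hedged sketch of the AVX2 alternative mentioned in the NOTE above (hypothetical
// helper, not part of this patch): build per-lane indices 0..7 scaled by the
// stride and issue a single hardware gather. Assumes __AVX2__ for
// _mm256_mullo_epi32 and _mm256_i32gather_ps.
#ifdef __AVX2__
inline Packet8f pgather_avx2_sketch(const float* from, int stride)
{
  __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32(stride),
                                   _mm256_setr_epi32(0,1,2,3,4,5,6,7));
  return _mm256_i32gather_ps(from, idx, 4); // scale of 4 bytes == sizeof(float)
}
#endif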

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, int stride)
{
  __m128 low = _mm256_extractf128_ps(from, 0);
  to[stride*0] = _mm_cvtss_f32(low);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));

  __m128 high = _mm256_extractf128_ps(from, 1);
  to[stride*4] = _mm_cvtss_f32(high);
  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, int stride)
{
  __m128d low = _mm256_extractf128_pd(from, 0);
  to[stride*0] = _mm_cvtsd_f64(low);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
  __m128d high = _mm256_extractf128_pd(from, 1);
  to[stride*2] = _mm_cvtsd_f64(high);
  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}

template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
  Packet8f pa = pset1<Packet8f>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
  Packet4d pa = pset1<Packet4d>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
  Packet8i pa = pset1<Packet8i>(a);
  pstore(to, pa);
}

template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}

template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
  return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
  __m256d tmp = _mm256_shuffle_pd(a,a,5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);
  // equivalent alternative:
  // __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
  // return _mm256_permute_pd(swap_halves,5);
}

// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm256_and_pd(a,mask);
}
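// Both masks clear only the IEEE sign bit: 0x7FFFFFFF for each float, while the
// alternating 0xFFFFFFFF,0x7FFFFFFF pairs keep the low word of each double
// intact and clear bit 63, so pabs is a single bitwise AND.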

// preduxp should be ok
// FIXME: why is this ok? why isn't the simple implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
  __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
  __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
  __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
  __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);

  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
  __m256 sum4 = _mm256_add_ps(perm4, hsum8);

  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
  return final;
}
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
  Packet4d tmp0, tmp1;

  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  return _mm256_blend_pd(tmp0, tmp1, 0xC);
}

template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
  Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
  tmp0 = _mm256_hadd_ps(tmp0,tmp0);
  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
  Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_hadd_pd(tmp0,tmp0));
}
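// Minimal sketch (illustrative only, not part of this patch): the reductions
// above combine naturally with pmadd; dot8 is a hypothetical helper computing
// an 8-element dot product from unaligned inputs.
inline float dot8(const float* x, const float* y)
{
  Packet8f acc = pmadd(ploadu<Packet8f>(x), ploadu<Packet8f>(y), pset1<Packet8f>(0.0f));
  return predux<Packet8f>(acc); // horizontal sum of the 8 products
}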

template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
{
  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}

template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
  Packet8f tmp;
  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
  Packet4d tmp;
  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}

template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}

template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}

template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
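// palign_impl shifts the concatenation [first, second] left by Offset scalars
// and keeps the leading packet-size elements in `first`; since AVX has no
// cross-lane byte shift, each offset is emulated below with blends and permutes.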

template<int Offset>
struct palign_impl<Offset,Packet8f>
{
  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
  {
    if (Offset==1)
    {
      first = _mm256_blend_ps(first, second, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_ps(first, second, 3);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_ps(first, second, 7);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
    }
    else if (Offset==4)
    {
      first = _mm256_blend_ps(first, second, 15);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
      first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
    }
    else if (Offset==5)
    {
      first = _mm256_blend_ps(first, second, 31);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0x88);
    }
    else if (Offset==6)
    {
      first = _mm256_blend_ps(first, second, 63);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0xcc);
    }
    else if (Offset==7)
    {
      first = _mm256_blend_ps(first, second, 127);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0xee);
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4d>
{
  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
  {
    if (Offset==1)
    {
      first = _mm256_blend_pd(first, second, 1);
      __m256d tmp = _mm256_permute_pd(first, 5);
      first = _mm256_permute2f128_pd(tmp, tmp, 1);
      first = _mm256_blend_pd(tmp, first, 0xA);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_pd(first, second, 3);
      first = _mm256_permute2f128_pd(first, first, 1);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_pd(first, second, 7);
      __m256d tmp = _mm256_permute_pd(first, 5);
      first = _mm256_permute2f128_pd(tmp, tmp, 1);
      first = _mm256_blend_pd(tmp, first, 5);
    }
  }
};

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
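// The 8x8 transpose proceeds in three stages: unpacklo/unpackhi interleave
// adjacent rows, _mm256_shuffle_ps gathers 4-wide columns within each 128-bit
// lane, and _mm256_permute2f128_ps finally exchanges the cross-lane halves.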

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));

  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);

  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_AVX_H

@ -21,6 +21,8 @@ static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);
static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};

//---------- float ----------
struct Packet2cf

@ -33,6 +35,7 @@ struct Packet2cf
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
  typedef Packet2cf type;
  typedef Packet2cf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -51,7 +54,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
  };
};

template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@ -65,6 +68,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
  return res;
}

template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
{
  std::complex<float> EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return Packet2cf(vec_ld(0, (const float*)af));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
{
  std::complex<float> EIGEN_ALIGN16 af[2];
  vec_st(from.v, 0, (float*)af);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
}
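// AltiVec has no strided or unaligned vector load, so gather/scatter bounce
// through a 16-byte-aligned stack buffer (EIGEN_ALIGN16) that vec_ld/vec_st
// can access directly.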

template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
@ -210,6 +229,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x
  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0);
  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1);
  kernel.packet[0].v = tmp;
}

} // end namespace internal

} // end namespace Eigen

@ -18,6 +18,10 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif

#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#endif
@ -73,6 +77,7 @@ static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -89,6 +94,7 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    // FIXME check the Has*
    Vectorizable = 1,
@ -97,8 +103,8 @@ template<> struct packet_traits<int> : default_packet_traits
  };
};

template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
/*
inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
@ -144,6 +150,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
  return s;
}
*/

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  float EIGEN_ALIGN16 af[4];
@ -161,6 +168,65 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
  return vc;
}

template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = vec_ld(0,a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
  a3 = vec_ld(0,a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}
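// pbroadcast4 performs a single aligned vector load and then splats each of
// the four scalars into its own packet, e.g. a = {x,y,z,w} yields
// a0 = {x,x,x,x}, ..., a3 = {w,w,w,w}.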

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
{
  float EIGEN_ALIGN16 af[4];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  af[2] = from[2*stride];
  af[3] = from[3*stride];
  return vec_ld(0, af);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
{
  int EIGEN_ALIGN16 ai[4];
  ai[0] = from[0*stride];
  ai[1] = from[1*stride];
  ai[2] = from[2*stride];
  ai[3] = from[3*stride];
  return vec_ld(0, ai);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
{
  float EIGEN_ALIGN16 af[4];
  vec_st(from, 0, af);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
  to[2*stride] = af[2];
  to[3*stride] = af[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
{
  int EIGEN_ALIGN16 ai[4];
  vec_st(from, 0, ai);
  to[0*stride] = ai[0];
  to[1*stride] = ai[1];
  to[2*stride] = ai[2];
  to[3*stride] = ai[3];
}

template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }

@ -494,6 +560,32 @@ struct palign_impl<Offset,Packet4i>
  }
};

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  Packet4f t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
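// Classic AltiVec 4x4 transpose: one round of vec_mergeh/vec_mergel interleaves
// rows 0/2 and 1/3, a second round interleaves the intermediates, leaving
// column i of the input in kernel.packet[i].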

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  Packet4i t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

} // end namespace internal

} // end namespace Eigen

@ -1,4 +1,5 @@
ADD_SUBDIRECTORY(SSE)
ADD_SUBDIRECTORY(AltiVec)
ADD_SUBDIRECTORY(NEON)
ADD_SUBDIRECTORY(AVX)
ADD_SUBDIRECTORY(Default)

@ -28,6 +28,7 @@ struct Packet2cf
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
  typedef Packet2cf type;
  typedef Packet2cf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -46,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
  };
};

template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@ -110,6 +111,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }

template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
{
  Packet4f res;
  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
  res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
  return Packet2cf(res);
}
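// The packet is assembled lane by lane with vsetq_lane_f32; `res` starts
// uninitialized since all four lanes are overwritten before use.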

template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
{
  to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }

template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
@ -246,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cf,2>& kernel) {
  float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
  kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
  kernel.packet[1].v = tmp;
}

} // end namespace internal

} // end namespace Eigen

@ -66,6 +66,7 @@ typedef uint32x4_t Packet4ui;
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -83,6 +84,7 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -95,12 +97,13 @@ template<> struct packet_traits<int> : default_packet_traits
// workaround gcc 4.2, 4.3 and 4.4 compilation issue
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif

template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)   { return vdupq_n_s32(from); }
@ -219,6 +222,40 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)     { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
{
  Packet4f res;
  res = vsetq_lane_f32(from[0*stride], res, 0);
  res = vsetq_lane_f32(from[1*stride], res, 1);
  res = vsetq_lane_f32(from[2*stride], res, 2);
  res = vsetq_lane_f32(from[3*stride], res, 3);
  return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
{
  Packet4i res;
  res = vsetq_lane_s32(from[0*stride], res, 0);
  res = vsetq_lane_s32(from[1*stride], res, 1);
  res = vsetq_lane_s32(from[2*stride], res, 2);
  res = vsetq_lane_s32(from[3*stride], res, 3);
  return res;
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
{
  to[stride*0] = vgetq_lane_f32(from, 0);
  to[stride*1] = vgetq_lane_f32(from, 1);
  to[stride*2] = vgetq_lane_f32(from, 2);
  to[stride*3] = vgetq_lane_f32(from, 3);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
{
  to[stride*0] = vgetq_lane_s32(from, 0);
  to[stride*1] = vgetq_lane_s32(from, 1);
  to[stride*2] = vgetq_lane_s32(from, 2);
  to[stride*3] = vgetq_lane_s32(from, 3);
}

template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr)     { EIGEN_ARM_PREFETCH(addr); }

@ -385,6 +422,7 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  max = vpmax_s32(a_lo, a_hi);
  max = vpmax_s32(max, max);

  return vget_lane_s32(max, 0);
}

@ -413,6 +451,27 @@ PALIGN_NEON(3,Packet4i,vextq_s32)

#undef PALIGN_NEON

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
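The NEON ptranspose added here is the classic zip-and-recombine 4x4 transpose: vzipq interleaves two rows pairwise, and vcombine stitches the low and high halves of the two zip results into columns. A scalar sketch of the same lane movement (illustrative stand-ins for the intrinsics, verified by the asserts):

#include <array>
#include <cassert>

using Vec4 = std::array<float, 4>;
struct Zip { Vec4 val[2]; };

// Scalar model of vzipq_f32: interleave the lanes of a and b.
Zip zip(const Vec4& a, const Vec4& b) {
  return { { {a[0], b[0], a[1], b[1]}, {a[2], b[2], a[3], b[3]} } };
}
// Scalar models of vget_low/vget_high + vcombine.
Vec4 combine_low (const Vec4& x, const Vec4& y) { return {x[0], x[1], y[0], y[1]}; }
Vec4 combine_high(const Vec4& x, const Vec4& y) { return {x[2], x[3], y[2], y[3]}; }

int main() {
  Vec4 r[4] = { {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15} };
  Zip t1 = zip(r[0], r[1]);
  Zip t2 = zip(r[2], r[3]);
  Vec4 c0 = combine_low (t1.val[0], t2.val[0]);   // {0,4,8,12}  = column 0
  Vec4 c1 = combine_high(t1.val[0], t2.val[0]);   // {1,5,9,13}  = column 1
  Vec4 c2 = combine_low (t1.val[1], t2.val[1]);   // {2,6,10,14} = column 2
  Vec4 c3 = combine_high(t1.val[1], t2.val[1]);   // {3,7,11,15} = column 3
  assert(c0[1] == 4 && c3[0] == 3 && c2[3] == 14);
  (void)c1;
}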
@@ -22,13 +22,18 @@ struct Packet2cf
   __m128 v;
 };

+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<float> > : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,

     HasAdd = 1,
     HasSub = 1,
@@ -42,8 +47,9 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo

 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }

-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
+
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
+{
+  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
+                              std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
+{
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}

 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
@@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
 #endif
 }

-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }

 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
@@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
 };

 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
@@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
 };

 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -248,13 +269,18 @@ struct Packet1cd
   __m128d v;
 };

+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<double> > : default_packet_traits
 {
   typedef Packet1cd type;
+  typedef Packet1cd half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
+    HasHalfPacket = 0,

     HasAdd = 1,
     HasSub = 1,
@@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif

-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };

 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 {
   const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
@@ -311,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
 template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }

 // FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }

 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

@@ -410,7 +437,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
 };

 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
@@ -419,7 +446,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
 };

 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
@@ -432,7 +459,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con

 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
 {
-  return Packet1cd(preverse(x.v));
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+  __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+  __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+  __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+  kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+  kernel.packet[1].v = tmp;
 }

 } // end namespace internal
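One subtlety in the complex pgather above: _mm_set_ps lists lanes from highest to lowest, so the imaginary part of the second element is passed first even though the memory layout stays (real, imag) pairs. A small stand-alone check of that lane order (plain SSE, not Eigen code):

#include <complex>
#include <xmmintrin.h>  // SSE

int main() {
  const std::complex<float> z[4] = { {1,2}, {0,0}, {3,4}, {0,0} };
  const int stride = 2;  // gather z[0] and z[2]
  // _mm_set_ps takes lanes high-to-low, so the register holds
  // [imag(z[2]), real(z[2]), imag(z[0]), real(z[0])] from lane 3 down to lane 0.
  __m128 p = _mm_set_ps(std::imag(z[1*stride]), std::real(z[1*stride]),
                        std::imag(z[0*stride]), std::real(z[0*stride]));
  float lanes[4];
  _mm_storeu_ps(lanes, p);  // lanes = {1, 2, 3, 4}: (re0, im0, re1, im1) in memory
  return !(lanes[0] == 1 && lanes[1] == 2 && lanes[2] == 3 && lanes[3] == 4);
}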
@@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
   x = _mm_or_ps(x, p4f_half);

   emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
+  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);

   /* part2:
      if( x < SQRTHF ) {
@@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
      } else { x = x - 1.0; }
   */
   Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = _mm_and_ps(x, mask);
+  Packet4f tmp = pand(x, mask);
   x = psub(x, p4f_1);
-  e = psub(e, _mm_and_ps(p4f_1, mask));
+  e = psub(e, pand(p4f_1, mask));
   x = padd(x, tmp);

   Packet4f x2 = pmul(x,x);
@@ -167,7 +167,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, p4i_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
-  return pmul(y, _mm_castsi128_ps(emm0));
+  return pmul(y, Packet4f(_mm_castsi128_ps(emm0)));
 }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
@@ -241,7 +241,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
   emm0 = _mm_add_epi32(emm0, p4i_1023_0);
   emm0 = _mm_slli_epi32(emm0, 20);
   emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmul(x, _mm_castsi128_pd(emm0));
+  return pmul(x, Packet2d(_mm_castsi128_pd(emm0)));
 }

 /* evaluation of 4 sines at once, using SSE2 intrinsics.
209
Eigen/src/Core/arch/SSE/PacketMath.h
Normal file → Executable file
@@ -22,9 +22,41 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif

+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_FUSED_MADD
+#define EIGEN_HAS_FUSED_MADD 1
+#endif
+#endif
+
+#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER)
+// With GCC's default ABI version, __m128 and __m256 are the same type and therefore we cannot
+// have overloads for both types without a linking error.
+// One solution is to increase the ABI version using -fabi-version=4 (or greater).
+// To work around this inconvenience, we rather wrap 128-bit types into the following helper
+// structure:
+// TODO disable this wrapper if abi-version>=4, but how to detect that without asking the user to define a macro?
+template<typename T>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
+};
+typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
 typedef __m128  Packet4f;
 typedef __m128i Packet4i;
 typedef __m128d Packet2d;
+#endif

 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
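The motivation for eigen_packet_wrapper is that, under GCC's default ABI, overloads taken on __m128 and __m256 can collide at link time; wrapping each raw SIMD type in a distinct struct with implicit conversions restores distinct C++ types at zero runtime cost. A stripped-down sketch of the idea (the extra tag parameter is a hypothetical generalization, not part of this patch):

// Give each raw SIMD type a distinct C++ type so overloads and template
// specializations on them cannot collide, while converting implicitly back
// and forth so intrinsics remain directly usable.
template <typename T, int UniqueTag>
struct packet_wrapper {
  packet_wrapper() {}
  packet_wrapper(const T& v) : m_val(v) {}   // implicit from the raw type
  operator T&() { return m_val; }            // implicit to the raw type
  operator const T&() const { return m_val; }
  T m_val;
};

// With e.g. typedef packet_wrapper<__m128, 0> Packet4f; an intrinsic call
// such as _mm_add_ps(a, b) still works on Packet4f values through the
// conversion operators, yet the wrapper types mangle distinctly.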
@@ -58,13 +90,18 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)


+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<float> : default_packet_traits
 {
   typedef Packet4f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
+    HasHalfPacket = 0,

     HasDiv = 1,
     HasSin = EIGEN_FAST_MATH,
@@ -77,19 +114,23 @@ template<> struct packet_traits<float> : default_packet_traits
 template<> struct packet_traits<double> : default_packet_traits
 {
   typedef Packet2d type;
+  typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
+    HasHalfPacket = 0,

     HasDiv = 1,
     HasExp = 1,
     HasSqrt = 1
   };
 };
+#endif
 template<> struct packet_traits<int> : default_packet_traits
 {
   typedef Packet4i type;
+  typedef Packet4i half;
   enum {
     // FIXME check the Has*
     Vectorizable = 1,
@@ -98,9 +139,9 @@ template<> struct packet_traits<int> : default_packet_traits
   };
 };

-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };

 #if defined(_MSC_VER) && (_MSC_VER==1500)
 // Workaround MSVC 9 internal compiler error.
@@ -110,13 +151,26 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
 #endif

+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
+#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
+  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+}
+#endif
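The pload1 added above sidesteps gcc's poor code generation for _mm_set1_ps (bug 203) by loading a single scalar and splatting lane 0 with a swizzle. A hedged sketch of the two broadcast routes in plain SSE — _mm_shuffle_ps here is a close stand-in for the vec4f_swizzle1 macro used in the patch:

#include <xmmintrin.h>

// Route 1: what _mm_set1_ps/_mm_load1_ps typically compile to with gcc (shufps).
__m128 broadcast_set1(const float* p) { return _mm_set1_ps(*p); }

// Route 2: the pload1 pattern above -- load one scalar into lane 0,
// then replicate lane 0 with a shuffle (pshufd-style splat).
__m128 broadcast_swizzle(const float* p) {
  __m128 t = _mm_load_ss(p);                          // {*p, 0, 0, 0}
  return _mm_shuffle_ps(t, t, _MM_SHUFFLE(0,0,0,0));  // splat lane 0
}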
+
+#ifndef EIGEN_VECTORIZE_AVX
 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+#endif
 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }

 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
@@ -139,7 +193,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
 }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
 {
-  return psub(_mm_setr_epi32(0,0,0,0), a);
+  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
 }

 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
@@ -173,6 +227,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co

 // for some weird reasons, it has to be overloaded for packets of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
+#endif

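With EIGEN_VECTORIZE_FMA defined, pmadd now maps to a true fused multiply-add: one instruction and a single rounding step, instead of the padd(pmul(a,b), c) fallback. Side by side (the FMA variant requires compiling with -mfma):

#include <immintrin.h>

// pmadd(a,b,c) = a*b + c. With FMA this is a single instruction computed
// with one rounding; without it, Eigen falls back to a multiply then an add.
__m128 madd_fma(__m128 a, __m128 b, __m128 c) { return _mm_fmadd_ps(a, b, c); }
__m128 madd_sse(__m128 a, __m128 b, __m128 c) { return _mm_add_ps(_mm_mul_ps(a, b), c); }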
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
@@ -218,7 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con

 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }

 #if defined(_MSC_VER)
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
@@ -236,7 +294,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { E
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
 #else
 // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
 // require pointer casting to incompatible pointer types and leads to invalid code
@@ -245,14 +303,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { E
 // TODO: do the same for MSVC (ICC is compatible)
 // NOTE: with the code below, MSVC's compiler crashes!

-#if defined(__GNUC__) && defined(__i386__)
+#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8)))
   // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
   #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
 #elif defined(__clang__)
   // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
   #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
 #else
   #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
 #endif

 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
@@ -283,7 +344,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
 #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 #else
   __m128d res;
   res = _mm_load_sd((const double*)(from)) ;
@@ -302,38 +363,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
 {
   Packet4i tmp;
-  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
 }

 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }

 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
   EIGEN_DEBUG_UNALIGNED_STORE
+#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES
+  _mm_storeu_pd(to, from);
+#else
   _mm_storel_pd((to), from);
   _mm_storeh_pd((to+1), from);
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
+{
+  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
+{
+  return _mm_set_pd(from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
+{
+  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
+{
+  to[stride*0] = _mm_cvtss_f32(from);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride)
+{
+  to[stride*0] = _mm_cvtsd_f64(from);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
+{
+  to[stride*0] = _mm_cvtsi128_si32(from);
+  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
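The SSE pscatter above extracts lane k by shuffling it into lane 0 and reading lane 0 with _mm_cvtss_f32 (resp. _mm_cvtsi128_si32 for integers); the low two bits of the shuffle immediate select which source lane lands in lane 0. A tiny stand-alone check:

#include <xmmintrin.h>
#include <cstdio>

int main() {
  __m128 v = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // lanes 0..3 hold 1, 2, 3, 4
  // _mm_shuffle_ps(v, v, k) with k = 1, 2, 3 moves lane k into lane 0
  // (only the low two selector bits matter here); _mm_cvtss_f32 reads lane 0.
  std::printf("%g %g %g %g\n",
              _mm_cvtss_f32(v),
              _mm_cvtss_f32(_mm_shuffle_ps(v, v, 1)),
              _mm_cvtss_f32(_mm_shuffle_ps(v, v, 2)),
              _mm_cvtss_f32(_mm_shuffle_ps(v, v, 3)));  // prints 1 2 3 4
}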

 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
 {
   Packet4f pa = _mm_set_ss(a);
-  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
+  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
 }
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
 {
   Packet2d pa = _mm_set_sd(a);
-  pstore(to, vec2d_swizzle1(pa,0,0));
+  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }

+#ifndef EIGEN_VECTORIZE_AVX
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif

 #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
@@ -380,6 +480,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
 #endif
 }

+// with AVX, the default implementations based on pload1 are faster
+#ifndef __AVX__
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec4f_swizzle1(a3, 0,0,0,0);
+  a1 = vec4f_swizzle1(a3, 1,1,1,1);
+  a2 = vec4f_swizzle1(a3, 2,2,2,2);
+  a3 = vec4f_swizzle1(a3, 3,3,3,3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  a0 = _mm_loaddup_pd(a+0);
+  a1 = _mm_loaddup_pd(a+1);
+  a2 = _mm_loaddup_pd(a+2);
+  a3 = _mm_loaddup_pd(a+3);
+#else
+  a1 = pload<Packet2d>(a);
+  a0 = vec2d_swizzle1(a1, 0,0);
+  a1 = vec2d_swizzle1(a1, 1,1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec2d_swizzle1(a3, 0,0);
+  a3 = vec2d_swizzle1(a3, 1,1);
+#endif
+}
+#endif
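pbroadcast4 loads four consecutive scalars and yields four packets, each a splat of one of them — the GEBP kernel uses it to broadcast a group of coefficients; on SSE3 the double variant gets this almost for free via _mm_loaddup_pd (movddup). As a scalar model of the contract (illustrative only):

#include <cstdio>

// Scalar model of pbroadcast4: read a[0..3] and produce four packets
// p0..p3 where packet k has every lane equal to a[k].
void broadcast4(const float* a, float p0[4], float p1[4], float p2[4], float p3[4]) {
  for (int l = 0; l < 4; ++l) {
    p0[l] = a[0]; p1[l] = a[1]; p2[l] = a[2]; p3[l] = a[3];
  }
}

int main() {
  const float a[4] = {1, 2, 3, 4};
  float p0[4], p1[4], p2[4], p3[4];
  broadcast4(a, p0, p1, p2, p3);
  std::printf("%g %g %g %g\n", p0[0], p1[0], p2[0], p3[0]);  // 1 2 3 4
}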
 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
 {
   vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
@@ -407,10 +539,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst(_mm_hadd_ps(tmp0, tmp0));
+  return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
 }

-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }

 // SSSE3 version:
 // EIGEN_STRONG_INLINE float predux(const Packet4i& a)
@@ -453,7 +585,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
-  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }

 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -476,11 +608,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 {
@@ -496,17 +628,17 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
   Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
 #else
   // after some experiments, it seems this is the fastest way to implement it
   // for GCC (e.g., it does not like using std::min after the pstore !!)
@@ -522,17 +654,17 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
   Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
 #else
   // after some experiments, it seems this is the fastest way to implement it
   // for GCC (e.g., it does not like using std::min after the pstore !!)
@@ -652,6 +784,31 @@ struct palign_impl<Offset,Packet2d>
 };
 #endif

+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+
 } // end namespace internal

 } // end namespace Eigen
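For Packet4f the new SSE ptranspose simply defers to the long-standing _MM_TRANSPOSE4_PS macro, which performs the 4x4 in-place transpose with unpack/shuffle pairs — the same unpacklo/unpackhi ladder written out explicitly for Packet4i above. A minimal check of the float path:

#include <xmmintrin.h>
#include <cstdio>

int main() {
  __m128 r0 = _mm_setr_ps( 0,  1,  2,  3);
  __m128 r1 = _mm_setr_ps( 4,  5,  6,  7);
  __m128 r2 = _mm_setr_ps( 8,  9, 10, 11);
  __m128 r3 = _mm_setr_ps(12, 13, 14, 15);
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   // rows become columns, in place
  float c0[4];
  _mm_storeu_ps(c0, r0);
  std::printf("%g %g %g %g\n", c0[0], c0[1], c0[2], c0[3]);  // 0 4 8 12
}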
@@ -320,6 +320,26 @@ struct functor_traits<scalar_asin_op<Scalar> >
   };
 };

+
+/** \internal
+  * \brief Template functor to compute the atan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atan()
+  */
+template<typename Scalar> struct scalar_atan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_atan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATan
+  };
+};
+
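scalar_atan_op follows Eigen's standard unary-functor shape: a scalar operator() plus a packetOp that is used whenever packet_traits<Scalar>::HasATan is set, with Cost feeding the evaluator's unrolling heuristics. Assuming the matching ArrayBase::atan() entry point is wired up elsewhere in this merge, usage is the usual lazy expression:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(4, 0.f, 1.f);
  // Element-wise atan via the new functor; the expression stays lazy
  // until it is assigned, where it may be vectorized via packetOp.
  Eigen::ArrayXf y = x.atan();
  std::cout << y.transpose() << std::endl;
}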
 /** \internal
   * \brief Template functor to compute the inverse of a scalar
   * \sa class CwiseUnaryOp, Cwise::inverse()

File diff suppressed because it is too large
@ -23,6 +23,8 @@ template<
|
|||||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
|
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
|
||||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
|
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
|
||||||
{
|
{
|
||||||
|
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
|
||||||
|
|
||||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||||
static EIGEN_STRONG_INLINE void run(
|
static EIGEN_STRONG_INLINE void run(
|
||||||
Index rows, Index cols, Index depth,
|
Index rows, Index cols, Index depth,
|
||||||
@ -51,6 +53,8 @@ template<
|
|||||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
|
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
|
||||||
{
|
{
|
||||||
|
|
||||||
|
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||||
|
|
||||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||||
static void run(Index rows, Index cols, Index depth,
|
static void run(Index rows, Index cols, Index depth,
|
||||||
const LhsScalar* _lhs, Index lhsStride,
|
const LhsScalar* _lhs, Index lhsStride,
|
||||||
@ -63,11 +67,9 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||||
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||||
|
|
||||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
|
||||||
|
|
||||||
Index kc = blocking.kc(); // cache block size along the K direction
|
Index kc = blocking.kc(); // cache block size along the K direction
|
||||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||||
//Index nc = blocking.nc(); // cache block size along the N direction
|
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
|
||||||
|
|
||||||
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||||
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||||
@ -80,13 +82,11 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
Index tid = omp_get_thread_num();
|
Index tid = omp_get_thread_num();
|
||||||
Index threads = omp_get_num_threads();
|
Index threads = omp_get_num_threads();
|
||||||
|
|
||||||
std::size_t sizeA = kc*mc;
|
LhsScalar* blockA = blocking.blockA();
|
||||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
eigen_internal_assert(blockA!=0);
|
||||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
|
|
||||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
|
|
||||||
|
|
||||||
RhsScalar* blockB = blocking.blockB();
|
std::size_t sizeB = kc*nc;
|
||||||
eigen_internal_assert(blockB!=0);
|
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
|
||||||
|
|
||||||
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
|
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
|
||||||
for(Index k=0; k<depth; k+=kc)
|
for(Index k=0; k<depth; k+=kc)
|
||||||
@ -94,54 +94,56 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
|
const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
|
||||||
|
|
||||||
       // In order to reduce the chance that a thread has to wait for the other,
-      // let's start by packing A'.
-      pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
+      // let's start by packing B'.
+      pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);

-      // Pack B_k to B' in a parallel fashion:
-      // each thread packs the sub block B_k,j to B'_j where j is the thread id.
+      // Pack A_k to A' in a parallel fashion:
+      // each thread packs the sub block A_k,i to A'_i where i is the thread id.

-      // However, before copying to B'_j, we have to make sure that no other thread is still using it,
+      // However, before copying to A'_i, we have to make sure that no other thread is still using it,
       // i.e., we test that info[tid].users equals 0.
       // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
       while(info[tid].users!=0) {}
       info[tid].users += threads;

-      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
+      pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);

-      // Notify the other threads that the part B'_j is ready to go.
+      // Notify the other threads that the part A'_i is ready to go.
       info[tid].sync = k;

-      // Computes C_i += A' * B' per B'_j
+      // Computes C_i += A' * B' per A'_i
       for(Index shift=0; shift<threads; ++shift)
       {
-        Index j = (tid+shift)%threads;
+        Index i = (tid+shift)%threads;

-        // At this point we have to make sure that B'_j has been updated by the thread j,
+        // At this point we have to make sure that A'_i has been updated by the thread i,
         // we use testAndSetOrdered to mimic a volatile access.
         // However, no need to wait for the B' part which has been updated by the current thread!
         if(shift>0)
-          while(info[j].sync!=k) {}
+          while(info[i].sync!=k) {}

-        gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
+        gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
       }

-      // Then keep going as usual with the remaining A'
-      for(Index i=mc; i<rows; i+=mc)
+      // Then keep going as usual with the remaining B'
+      for(Index j=nc; j<cols; j+=nc)
       {
-        const Index actual_mc = (std::min)(i+mc,rows)-i;
+        const Index actual_nc = (std::min)(j+nc,cols)-j;

-        // pack A_i,k to A'
-        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+        // pack B_k,j to B'
+        pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);

-        // C_i += A' * B'
-        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
+        // C_j += A' * B'
+        gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
       }

-      // Release all the sub blocks B'_j of B' for the current thread,
+      // Release all the sub blocks A'_i of A' for the current thread,
       // i.e., we simply decrement the number of users by 1
-      for(Index j=0; j<threads; ++j)
+      #pragma omp critical
+      {
+        for(Index i=0; i<threads; ++i)
         #pragma omp atomic
-        --(info[j].users);
+          --(info[i].users);
+      }
     }
   }
   else
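The packing protocol in the hunk above hinges on two volatile counters per thread. A minimal, self-contained sketch of that handshake (simplified, hypothetical types and function; the real state lives in Eigen's GemmParallelInfo):

    struct PackInfo {
      int volatile sync;   // depth index k of the last published pack
      int volatile users;  // number of threads still reading this sub-block
    };

    // Sketch of what each thread tid does for one depth slice k around packing
    // its sub-block of A'. Consumers spin on info[i].sync != k before reading.
    inline void pack_and_publish(PackInfo* info, int tid, int threads, int k)
    {
      while (info[tid].users != 0) {}  // wait until last round's readers are done
      info[tid].users += threads;      // claim the block: every thread reads it once
      // ... pack the sub-block A'_tid here ...
      info[tid].sync = k;              // publish it to the other threads
    }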
@@ -151,38 +153,34 @@ static void run(Index rows, Index cols, Index depth,

     // this is the sequential version!
     std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*nc;

     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());

     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    // (==GEMM_VAR1)
     for(Index k2=0; k2<depth; k2+=kc)
     {
       const Index actual_kc = (std::min)(k2+kc,depth)-k2;

       // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-      // Note that this panel will be read as many times as the number of blocks in the lhs's
-      // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+      // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+      // Note that this panel will be read as many times as the number of blocks in the rhs's
+      // horizontal panel which is, in practice, a very low number.
+      pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);

-      // For each mc x kc block of the lhs's vertical panel...
-      // (==GEPP_VAR1)
-      for(Index i2=0; i2<rows; i2+=mc)
+      // For each kc x nc block of the rhs's horizontal panel...
+      for(Index j2=0; j2<cols; j2+=nc)
       {
-        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+        const Index actual_nc = (std::min)(j2+nc,cols)-j2;

-        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
+        // We pack the rhs's block into a sequential chunk of memory (L2 caching)
         // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+        // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+        pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);

-        // Everything is packed, we can now call the block * panel kernel:
-        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        // Everything is packed, we can now call the panel * block kernel:
+        gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
       }
     }
   }
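A worked example of the blocking arithmetic in the new loop nest (illustrative sizes, not taken from the source):

    // depth = 1000, kc = 256  ->  lhs panels of actual_kc = 256, 256, 256, 232
    // cols  = 1000, nc = 512  ->  per panel, rhs blocks of actual_nc = 512, 488
    // so pack_lhs runs 4 times and pack_rhs 4*2 = 8 times for the whole product;
    // the many re-reads inside gebp hit the packed, cache-resident copies only.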
@@ -203,14 +201,13 @@ struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
 template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
 struct gemm_functor
 {
-  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
-               BlockingType& blocking)
+  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
     : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
   {}

   void initParallelSession() const
   {
-    m_blocking.allocateB();
+    m_blocking.allocateA();
   }

   void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
@@ -225,6 +222,8 @@ struct gemm_functor
           m_actualAlpha, m_blocking, info);
   }

+  typedef typename Gemm::Traits Traits;
+
   protected:
     const Lhs& m_lhs;
     const Rhs& m_rhs;
@@ -245,7 +244,6 @@ class level3_blocking
   protected:
     LhsScalar* m_blockA;
    RhsScalar* m_blockB;
-    RhsScalar* m_blockW;

     DenseIndex m_mc;
     DenseIndex m_nc;
@@ -254,7 +252,7 @@ class level3_blocking
   public:

     level3_blocking()
-      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
+      : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
     {}

     inline DenseIndex mc() const { return m_mc; }
@@ -263,7 +261,6 @@ class level3_blocking

     inline LhsScalar* blockA() { return m_blockA; }
     inline RhsScalar* blockB() { return m_blockB; }
-    inline RhsScalar* blockW() { return m_blockW; }
 };

 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
@@ -282,29 +279,25 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
     enum {
       SizeA = ActualRows * MaxDepth,
-      SizeB = ActualCols * MaxDepth,
-      SizeW = MaxDepth * Traits::WorkSpaceFactor
+      SizeB = ActualCols * MaxDepth
     };

-    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
-    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
+    EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];

   public:

-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
+    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
     {
       this->m_mc = ActualRows;
       this->m_nc = ActualCols;
       this->m_kc = MaxDepth;
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
-      this->m_blockW = m_staticW;
     }

     inline void allocateA() {}
     inline void allocateB() {}
-    inline void allocateW() {}
     inline void allocateAll() {}
 };

@@ -323,20 +316,28 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M

     DenseIndex m_sizeA;
     DenseIndex m_sizeB;
-    DenseIndex m_sizeW;

   public:

-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
+    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
       this->m_kc = depth;

-      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
+      if(full_rows)
+      {
+        DenseIndex m = this->m_mc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
+      }
+      else // full columns
+      {
+        DenseIndex n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
+      }

       m_sizeA = this->m_mc * this->m_kc;
       m_sizeB = this->m_kc * this->m_nc;
-      m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
     }

     void allocateA()
@@ -351,24 +352,16 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
         this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
     }

-    void allocateW()
-    {
-      if(this->m_blockW==0)
-        this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
-    }
-
     void allocateAll()
     {
       allocateA();
       allocateB();
-      allocateW();
     }

     ~gemm_blocking_space()
     {
       aligned_delete(this->m_blockA, m_sizeA);
       aligned_delete(this->m_blockB, m_sizeB);
-      aligned_delete(this->m_blockW, m_sizeW);
     }
 };

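A hypothetical sketch of the extended constructor in use (the template argument list and namespace are assumptions for illustration, not a verbatim Eigen call site). With full_rows = true, computeProductBlockingSizes receives a discardable copy of m_mc, so the row blocking stays at the full row count and only kc/nc may shrink; with the default full_rows = false the column blocking stays full instead:

    void make_blocking(Eigen::DenseIndex rows, Eigen::DenseIndex cols, Eigen::DenseIndex depth)
    {
      typedef Eigen::internal::gemm_blocking_space<Eigen::ColMajor, float, float,
          Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1> Blocking;  // assumed args
      Blocking blocking(rows, cols, depth, /*full_rows=*/true);  // keeps m_mc == rows
      blocking.allocateA();  // note: allocateW() is gone along with the workspace buffer
      blocking.allocateB();
    }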
@@ -395,6 +388,36 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
       EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
     }

+    template<typename Dest>
+    inline void evalTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() = m_lhs .lazyProduct( m_rhs );
+      else
+      {
+        dst.setZero();
+        scaleAndAddTo(dst,Scalar(1));
+      }
+    }
+
+    template<typename Dest>
+    inline void addTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() += m_lhs .lazyProduct( m_rhs );
+      else
+        scaleAndAddTo(dst,Scalar(1));
+    }
+
+    template<typename Dest>
+    inline void subTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() -= m_lhs .lazyProduct( m_rhs );
+      else
+        scaleAndAddTo(dst,Scalar(-1));
+    }
+
     template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
     {
       eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
@@ -417,7 +440,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
                                                    (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
               _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;

-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
+      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);

       internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
     }
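The evalTo/addTo/subTo overloads added above short-circuit very small products to lazyProduct, skipping the packing and blocking machinery. The threshold sums three extents; a usage sketch under the assumption that dynamic-size products take this GemmProduct path:

    #include <Eigen/Dense>
    using Eigen::MatrixXf;

    void small_vs_large()
    {
      MatrixXf A = MatrixXf::Random(5,4), B = MatrixXf::Random(4,6);
      MatrixXf C = A * B;  // m_rhs.rows()+dst.rows()+dst.cols() = 4+5+6 = 15 < 20
                           // -> coefficient-wise lazyProduct path
      MatrixXf D = MatrixXf::Random(50,50) * MatrixXf::Random(50,50);
                           // 50+50+50 = 150 -> blocked GEMM path
    }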
@@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
     if(mc > Traits::nr)
       mc = (mc/Traits::nr)*Traits::nr;

-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*size;
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
-    RhsScalar* blockB = allocatedBlockB + sizeW;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);

     gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
       // 3 - after the diagonal => processed with gebp or skipped
       if (UpLo==Lower)
         gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-             -1, -1, 0, 0, allocatedBlockB);
+             -1, -1, 0, 0);

-      sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
+      sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);

       if (UpLo==Upper)
       {
         Index j2 = i2+actual_mc;
         gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-             -1, -1, 0, 0, allocatedBlockB);
+             -1, -1, 0, 0);
       }
     }
   }
@@ -136,7 +133,7 @@ struct tribb_kernel
   enum {
     BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
   };
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
+  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
   {
     gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
     Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
@@ -150,7 +147,7 @@ struct tribb_kernel

       if(UpLo==Upper)
         gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);

       // selfadjoint micro block
       {
@@ -158,7 +155,7 @@ struct tribb_kernel
         buffer.setZero();
         // 1 - apply the kernel on the temporary buffer
         gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);
         // 2 - triangular accumulation
         for(Index j1=0; j1<actualBlockSize; ++j1)
         {
@@ -173,7 +170,7 @@ struct tribb_kernel
       {
         Index i = j+actualBlockSize;
         gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);
       }
     }
   }
@@ -278,6 +275,8 @@ template<typename MatrixType, unsigned int UpLo>
 template<typename ProductDerived, typename _Lhs, typename _Rhs>
 TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
 {
+  eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
+
   general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);

   return *this;
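The new assertion guards rank-update style assignments: the destination triangular view must already have the product's dimensions. An illustrative sketch (hypothetical sizes):

    #include <Eigen/Dense>
    using namespace Eigen;

    void rank_update_example()
    {
      MatrixXd B = MatrixXd::Random(5,3);
      MatrixXd A = MatrixXd::Zero(5,5);
      A.triangularView<Lower>() += B * B.transpose();   // 5x5 == 5x5: passes the new assert
      // MatrixXd A4 = MatrixXd::Zero(4,4);
      // A4.triangularView<Lower>() += B * B.transpose(); // 4x4 vs 5x5: would now assert
    }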
@@ -141,6 +141,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
     alignedSize = 0;
     alignedStart = 0;
   }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    // Currently, it seems to be better to perform unaligned loads anyway
+    alignmentPattern = NoneAligned;
+  }
   else if (LhsPacketSize>1)
   {
     eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
@@ -405,6 +411,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
     alignedSize = 0;
     alignedStart = 0;
   }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    alignmentPattern = NoneAligned;
+  }
   else if (LhsPacketSize>1)
   {
     eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
@@ -442,7 +453,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
   Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
-    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+    EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
     ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

     // this helps the compiler generating good binary code
@@ -551,7 +562,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
   {
     for (Index i=start; i<end; ++i)
     {
-      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+      EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
       ResPacket ptmp0 = pset1<ResPacket>(tmp0);
       const LhsScalar* lhs0 = lhs + i*lhsStride;
       // process first unaligned result's coeffs
@@ -73,13 +73,13 @@ namespace internal {

 template<typename Index> struct GemmParallelInfo
 {
-  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
+  GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}

   int volatile sync;
   int volatile users;

-  Index rhs_start;
-  Index rhs_length;
+  Index lhs_start;
+  Index lhs_length;
 };

 template<bool Condition, typename Functor, typename Index>
@@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
   if((!Condition) || (omp_get_num_threads()>1))
     return func(0,rows, 0,cols);

-  Index size = transpose ? cols : rows;
+  Index size = transpose ? rows : cols;

   // 2- compute the maximal number of threads from the size of the product:
   // FIXME this has to be fine tuned
@@ -126,26 +126,25 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
     std::swap(rows,cols);

   Index blockCols = (cols / threads) & ~Index(0x3);
-  Index blockRows = (rows / threads) & ~Index(0x7);
+  Index blockRows = (rows / threads);
+  blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;

   GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];

-  #pragma omp parallel for schedule(static,1) num_threads(threads)
-  for(Index i=0; i<threads; ++i)
+  #pragma omp parallel num_threads(threads)
   {
+    Index i = omp_get_thread_num();
     Index r0 = i*blockRows;
     Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;

     Index c0 = i*blockCols;
     Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;

-    info[i].rhs_start = c0;
-    info[i].rhs_length = actualBlockCols;
+    info[i].lhs_start = r0;
+    info[i].lhs_length = actualBlockRows;

-    if(transpose)
-      func(0, cols, r0, actualBlockRows, info);
-    else
-      func(r0, actualBlockRows, 0,cols, info);
+    if(transpose) func(c0, actualBlockCols, 0, rows, info);
+    else          func(0, rows, c0, actualBlockCols, info);
   }

   delete[] info;
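The row split handed to each thread must now stay a multiple of the packing granularity mr exposed through the functor's Traits, otherwise a thread boundary would cut through a micro-panel of the packed lhs. Worked numbers (illustrative):

    // rows = 1000, threads = 8, mr = 12:
    //   rows/threads = 125  ->  (125/12)*12 = 120 rows for threads 0..6,
    //   and the last thread absorbs the remainder: 1000 - 7*120 = 160 rows.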
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {

 // pack a selfadjoint block diagonal for use with the gebp_kernel
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
 struct symm_pack_lhs
 {
   template<int BlockRows> inline
@@ -45,25 +45,32 @@ struct symm_pack_lhs
   }
   void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
   {
+    enum { PacketSize = packet_traits<Scalar>::size };
     const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
     Index count = 0;
-    Index peeled_mc = (rows/Pack1)*Pack1;
-    for(Index i=0; i<peeled_mc; i+=Pack1)
-    {
-      pack<Pack1>(blockA, lhs, cols, i, count);
-    }
-
-    if(rows-peeled_mc>=Pack2)
-    {
-      pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
-      peeled_mc += Pack2;
-    }
+    //Index peeled_mc3 = (rows/Pack1)*Pack1;
+
+    const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+    const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
+    if(Pack1>=3*PacketSize)
+      for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+        pack<3*PacketSize>(blockA, lhs, cols, i, count);
+
+    if(Pack1>=2*PacketSize)
+      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+        pack<2*PacketSize>(blockA, lhs, cols, i, count);
+
+    if(Pack1>=1*PacketSize)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+        pack<1*PacketSize>(blockA, lhs, cols, i, count);

     // do the same with mr==1
-    for(Index i=peeled_mc; i<rows; i++)
+    for(Index i=peeled_mc1; i<rows; i++)
     {
       for(Index k=0; k<i; k++)
         blockA[count++] = lhs(i, k);           // normal

       blockA[count++] = numext::real(lhs(i, i)); // real (diagonal)

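To see how the three peeling tiers above partition the rows, take AVX floats (PacketSize = 8) with Pack1 = 3*PacketSize = 24 and rows = 100 (illustrative numbers):

    // peeled_mc3 = (100/24)*24           = 96  -> rows [0,96) packed 24 at a time
    // peeled_mc2 = 96 + ((100-96)/16)*16 = 96  -> no 16-row blocks left
    // peeled_mc1 = (100/8)*8             = 96  -> no 8-row blocks left either
    // rows [96,100) fall through to the scalar mr==1 loop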
@@ -82,7 +89,8 @@ struct symm_pack_rhs
     Index end_k = k2 + rows;
     Index count = 0;
     const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
-    Index packet_cols = (cols/nr)*nr;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;

     // first part: normal case
     for(Index j2=0; j2<k2; j2+=nr)
@@ -91,79 +99,151 @@ struct symm_pack_rhs
       {
         blockB[count+0] = rhs(k,j2+0);
         blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
+        if (nr>=4)
         {
           blockB[count+2] = rhs(k,j2+2);
           blockB[count+3] = rhs(k,j2+3);
         }
+        if (nr>=8)
+        {
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+        }
         count += nr;
       }
     }

     // second part: diagonal block
-    for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
-    {
-      // again we can split vertically in three different parts (transpose, symmetric, normal)
-      // transpose
-      for(Index k=k2; k<j2; k++)
-      {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
-        {
-          blockB[count+2] = numext::conj(rhs(j2+2,k));
-          blockB[count+3] = numext::conj(rhs(j2+3,k));
-        }
-        count += nr;
-      }
-      // symmetric
-      Index h = 0;
-      for(Index k=j2; k<j2+nr; k++)
-      {
-        // normal
-        for (Index w=0 ; w<h; ++w)
-          blockB[count+w] = rhs(k,j2+w);
-
-        blockB[count+h] = numext::real(rhs(k,k));
-
-        // transpose
-        for (Index w=h+1 ; w<nr; ++w)
-          blockB[count+w] = numext::conj(rhs(j2+w,k));
-        count += nr;
-        ++h;
-      }
-      // normal
-      for(Index k=j2+nr; k<end_k; k++)
-      {
-        blockB[count+0] = rhs(k,j2+0);
-        blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
-        {
-          blockB[count+2] = rhs(k,j2+2);
-          blockB[count+3] = rhs(k,j2+3);
-        }
-        count += nr;
-      }
-    }
+    Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+    if(nr>=8)
+    {
+      for(Index j2=k2; j2<end8; j2+=8)
+      {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+8; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<8; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 8;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+8; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
+      {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+4; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<4; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 4;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+4; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          count += 4;
+        }
+      }
+    }

     // third part: transposed
-    for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
-    {
-      for(Index k=k2; k<end_k; k++)
-      {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
-        {
-          blockB[count+2] = numext::conj(rhs(j2+2,k));
-          blockB[count+3] = numext::conj(rhs(j2+3,k));
-        }
-        count += nr;
-      }
-    }
+    if(nr>=8)
+    {
+      for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+      }
+    }

     // copy the remaining columns one at a time (=> the same with nr==1)
-    for(Index j2=packet_cols; j2<cols; ++j2)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
     {
       // transpose
       Index half = (std::min)(end_k,j2);
@@ -261,11 +341,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
     // kc must smaller than mc
     kc = (std::min)(kc,mc);

-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
+    std::size_t sizeB = kc*cols;
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
     ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
+    Scalar* blockB = allocatedBlockB;

     gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -348,11 +427,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
     computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
+    std::size_t sizeB = kc*cols;
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
     ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
+    Scalar* blockB = allocatedBlockB;

     gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -423,11 +501,11 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
         NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
         internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
       ::run(
         lhs.rows(), rhs.cols(),                   // sizes
         &lhs.coeffRef(0,0), lhs.outerStride(),    // lhs info
         &rhs.coeffRef(0,0), rhs.outerStride(),    // rhs info
         &dst.coeffRef(0,0), dst.outerStride(),    // result info
         actualAlpha                               // alpha
       );
   }
 };
@@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,

   std::size_t sizeA = kc*mc;
   std::size_t sizeB = kc*cols;
-  std::size_t sizeW = kc*Traits::WorkSpaceFactor;

   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

   Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
   triangularBuffer.setZero();
@@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
           pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);

           gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
-                      actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                      actualPanelWidth, actual_kc, 0, blockBOffset);

           // GEBP with remaining micro panel
           if (lengthTarget>0)
@@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
             pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);

             gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -211,7 +209,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
         gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
           (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);

-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
       }
     }
   }
@@ -265,12 +263,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction

   std::size_t sizeA = kc*mc;
-  std::size_t sizeB = kc*cols;
-  std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+  std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);

   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

   Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
   triangularBuffer.setZero();
@@ -304,6 +300,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
     Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;

     Scalar* geb = blockB+ts*ts;
+    geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));

     pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);

@@ -357,14 +354,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
                       actual_mc, panelLength, actualPanelWidth,
                       alpha,
                       actual_kc, actual_kc,  // strides
-                      blockOffset, blockOffset,// offsets
-                      blockW); // workspace
+                      blockOffset, blockOffset);// offsets
         }
       }
       gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
                   blockA, geb, actual_mc, actual_kc, rs,
                   alpha,
-                  -1, -1, 0, 0, blockW);
+                  -1, -1, 0, 0);
     }
   }
 }
@@ -129,7 +129,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
     MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
   /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
     if (size<(std::max)(rows,cols)) { \
-      typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
       if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
       x = x_tmp.data(); \
       if (size<rows) { \
@@ -214,7 +213,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
     MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
   /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
     if (size<(std::max)(rows,cols)) { \
-      typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
       if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
       x = x_tmp.data(); \
       if (size<rows) { \
@@ -66,11 +66,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju

   std::size_t sizeA = kc*mc;
   std::size_t sizeB = kc*cols;
-  std::size_t sizeW = kc*Traits::WorkSpaceFactor;

   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

   conj_if<Conjugate> conj;
   gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
@@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
             pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);

             gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -174,7 +172,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
       {
         pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);

-        gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
+        gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
       }
     }
   }
@@ -215,11 +213,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj

   std::size_t sizeA = kc*mc;
   std::size_t sizeB = kc*size;
-  std::size_t sizeW = kc*Traits::WorkSpaceFactor;

   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

   conj_if<Conjugate> conj;
   gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
@@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
                         actual_mc, panelLength, actualPanelWidth,
                         Scalar(-1),
                         actual_kc, actual_kc,  // strides
-                        panelOffset, panelOffset, // offsets
-                        blockW); // workspace
+                        panelOffset, panelOffset); // offsets
           }

           // unblocked triangular solve
@@ -317,7 +312,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
         if (rs>0)
           gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
                       actual_mc, actual_kc, rs, Scalar(-1),
-                      -1, -1, 0, 0, blockW);
+                      -1, -1, 0, 0);
       }
     }
   }
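One pattern recurs across all the hunks above: every gebp/gebp_kernel call site drops its trailing workspace pointer, and the matching sizeW = kc*Traits::WorkSpaceFactor stack allocations disappear with it. Schematically, taken from the diff itself:

    // before: a caller-provided scratch buffer was threaded through each call
    gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols,
                alpha, -1, -1, 0, 0, blockW);
    // after: the kernel no longer needs external workspace
    gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols,
                alpha, -1, -1, 0, 0);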
@@ -54,8 +54,25 @@
 #endif

 #if defined EIGEN_USE_MKL
+# include <mkl.h>
+/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
+# ifndef INTEL_MKL_VERSION
+#  undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
+# elif INTEL_MKL_VERSION < 100305 /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/
+#  undef EIGEN_USE_MKL
+# endif
+# ifndef EIGEN_USE_MKL
+  /*If the MKL version is too old, undef everything*/
+#  undef EIGEN_USE_MKL_ALL
+#  undef EIGEN_USE_BLAS
+#  undef EIGEN_USE_LAPACKE
+#  undef EIGEN_USE_MKL_VML
+#  undef EIGEN_USE_LAPACKE_STRICT
+#  undef EIGEN_USE_LAPACKE
+# endif
+#endif

-#include <mkl.h>
+#if defined EIGEN_USE_MKL
 #include <mkl_lapacke.h>
 #define EIGEN_MKL_VML_THRESHOLD 128

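Effect of the new guard from the user's side: requesting MKL against a pre-10.3 installation no longer fails on a missing lapacke.h; the EIGEN_USE_* switches are undefined and Eigen falls back to its built-in kernels. A usage sketch:

    #define EIGEN_USE_MKL_ALL   // ask for MKL everywhere
    #include <Eigen/Dense>      // with INTEL_MKL_VERSION < 100305 the guard above
                                // undefines EIGEN_USE_MKL* and the built-in
                                // Eigen code paths are used instead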
@@ -66,13 +66,24 @@
 #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
 #endif

+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+#define EIGEN_ALIGN_BYTES 16
+
 #ifdef EIGEN_DONT_ALIGN
   #ifndef EIGEN_DONT_ALIGN_STATICALLY
     #define EIGEN_DONT_ALIGN_STATICALLY
   #endif
   #define EIGEN_ALIGN 0
-#else
+#elif !defined(EIGEN_DONT_VECTORIZE)
+  #if defined(__AVX__)
+    #undef EIGEN_ALIGN_BYTES
+    #define EIGEN_ALIGN_BYTES 32
+  #endif
   #define EIGEN_ALIGN 1
+#else
+  #define EIGEN_ALIGN 0
 #endif

 // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
@@ -286,13 +297,19 @@ namespace Eigen {
 #endif

 #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES)

 #if EIGEN_ALIGN_STATICALLY
 #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
 #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
+#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32
+#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT
 #else
 #define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
 #define EIGEN_USER_ALIGN16
+#define EIGEN_USER_ALIGN32
+#define EIGEN_USER_ALIGN_DEFAULT
 #endif

 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
@@ -326,13 +343,13 @@ namespace Eigen {
 #elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =; \
-    EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
     template <typename OtherDerived> \
-    EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
 #else
 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =; \
-    EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
     { \
       Base::operator=(other); \
       return *this; \
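The new EIGEN_ALIGN_DEFAULT follows EIGEN_ALIGN_BYTES, so a single declaration is correctly aligned for both SSE and AVX builds:

    EIGEN_ALIGN_DEFAULT float buffer[64];  // 16-byte aligned in SSE builds,
                                           // 32-byte aligned when __AVX__ is defined
    EIGEN_ALIGN32 double wide[8];          // explicit 32-byte boundary, build-independent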
@ -32,7 +32,7 @@
 // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
 // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
 #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
- && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ )
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16)
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@ -42,14 +42,14 @@
 // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
 // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
 // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
-#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
+#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN_BYTES == 16)
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
 #endif

-#if defined(__APPLE__) \
- || defined(_WIN64) \
+#if (defined(__APPLE__) && (EIGEN_ALIGN_BYTES == 16)) \
+ || (defined(_WIN64) && (EIGEN_ALIGN_BYTES == 16)) \
  || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
  || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
   #define EIGEN_MALLOC_ALREADY_ALIGNED 1
@ -73,7 +73,7 @@
   #define EIGEN_HAS_POSIX_MEMALIGN 0
 #endif

-#ifdef EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
   #define EIGEN_HAS_MM_MALLOC 1
 #else
   #define EIGEN_HAS_MM_MALLOC 0
@ -89,7 +89,7 @@ inline void throw_std_bad_alloc()
 #ifdef EIGEN_EXCEPTIONS
   throw std::bad_alloc();
 #else
-  std::size_t huge = -1;
+  std::size_t huge = static_cast<std::size_t>(-1);
   new int[huge];
 #endif
 }
@ -105,9 +105,9 @@ inline void throw_std_bad_alloc()
   */
 inline void* handmade_aligned_malloc(std::size_t size)
 {
-  void *original = std::malloc(size+16);
+  void *original = std::malloc(size+EIGEN_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
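The helper always over-allocates by one full alignment quantum so there is room both to round the pointer up and to stash the original pointer one slot below the returned address. A self-contained sketch of the same arithmetic (16-byte boundary hard-coded as a stand-in for EIGEN_ALIGN_BYTES):

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
      const std::size_t kAlign = 16;               // stand-in for EIGEN_ALIGN_BYTES
      void* original = std::malloc(100 + kAlign);
      // Round up to the next multiple of kAlign; the +kAlign guarantees at
      // least one pointer-sized slot below 'aligned' to store 'original'.
      void* aligned = reinterpret_cast<void*>(
          (reinterpret_cast<std::size_t>(original) & ~(kAlign - 1)) + kAlign);
      *(reinterpret_cast<void**>(aligned) - 1) = original;   // bookkeeping slot
      std::printf("aligned %% 16 == %zu\n",
                  reinterpret_cast<std::size_t>(aligned) % kAlign);  // prints 0
      std::free(*(reinterpret_cast<void**>(aligned) - 1));   // free via the slot
      return 0;
    }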
@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
   if (ptr == 0) return handmade_aligned_malloc(size);
   void *original = *(reinterpret_cast<void**>(ptr) - 1);
   std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
-  original = std::realloc(original,size+16);
+  original = std::realloc(original,size+EIGEN_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
   void *previous_aligned = static_cast<char *>(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed()
 {}
 #endif

-/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
+/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
   * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
   */
 inline void* aligned_malloc(size_t size)
@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size)
 #elif EIGEN_MALLOC_ALREADY_ALIGNED
   result = std::malloc(size);
 #elif EIGEN_HAS_POSIX_MEMALIGN
-  if(posix_memalign(&result, 16, size)) result = 0;
+  if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0;
 #elif EIGEN_HAS_MM_MALLOC
-  result = _mm_malloc(size, 16);
+  result = _mm_malloc(size, EIGEN_ALIGN_BYTES);
 #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-  result = _aligned_malloc(size, 16);
+  result = _aligned_malloc(size, EIGEN_ALIGN_BYTES);
 #else
   result = handmade_aligned_malloc(size);
 #endif
@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
   // implements _mm_malloc/_mm_free based on the corresponding _aligned_
   // functions. This may not always be the case and we just try to be safe.
   #if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free)
-    result = _aligned_realloc(ptr,new_size,16);
+    result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
   #else
     result = generic_aligned_realloc(ptr,new_size,old_size);
   #endif
 #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-  result = _aligned_realloc(ptr,new_size,16);
+  result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
 #else
   result = handmade_aligned_realloc(ptr,new_size,old_size);
 #endif
@ -552,7 +552,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
 #ifndef EIGEN_ALLOCA
-  #if (defined __linux__)
+  #if (defined __linux__) || (defined __APPLE__)
     #define EIGEN_ALLOCA alloca
   #elif defined(_MSC_VER)
     #define EIGEN_ALLOCA _alloca
@ -607,9 +607,9 @@ template<typename T> class aligned_stack_memory_handler
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
   */
 #ifdef EIGEN_ALLOCA
+  // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled.
-  #if defined(__arm__) || defined(_WIN32)
-    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
+  #if defined(__arm__) || defined(_WIN32) || EIGEN_ALIGN_BYTES > 16
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
   #else
     #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
   #endif
@ -679,7 +679,7 @@ template<typename T> class aligned_stack_memory_handler

 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0)))

 /****************************************************************************/
@ -700,98 +700,42 @@ template<typename T> class aligned_stack_memory_handler
   * \sa \ref TopicStlContainers.
   */
 template<class T>
-class aligned_allocator
+class aligned_allocator : public std::allocator<T>
 {
 public:
   typedef size_t          size_type;
   typedef std::ptrdiff_t  difference_type;
   typedef T*              pointer;
   typedef const T*        const_pointer;
   typedef T&              reference;
   typedef const T&        const_reference;
   typedef T               value_type;

   template<class U>
   struct rebind
   {
     typedef aligned_allocator<U> other;
   };

-  pointer address( reference value ) const
-  {
-    return &value;
-  }
-
-  const_pointer address( const_reference value ) const
-  {
-    return &value;
-  }
-
-  aligned_allocator()
-  {
-  }
-
-  aligned_allocator( const aligned_allocator& )
-  {
-  }
-
-  template<class U>
-  aligned_allocator( const aligned_allocator<U>& )
-  {
-  }
-
-  ~aligned_allocator()
-  {
-  }
-
-  size_type max_size() const
-  {
-    return (std::numeric_limits<size_type>::max)();
-  }
-
-  pointer allocate( size_type num, const void* hint = 0 )
-  {
-    EIGEN_UNUSED_VARIABLE(hint);
-    internal::check_size_for_overflow<T>(num);
-    return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
-  }
-
-  void construct( pointer p, const T& value )
-  {
-    ::new( p ) T( value );
-  }
-
-#if (__cplusplus >= 201103L)
-  template <typename U, typename... Args>
-  void construct( U* u, Args&&... args)
-  {
-    ::new( static_cast<void*>(u) ) U( std::forward<Args>( args )... );
-  }
-#endif
-
-  void destroy( pointer p )
-  {
-    p->~T();
-  }
-
-#if (__cplusplus >= 201103L)
-  template <typename U>
-  void destroy( U* u )
-  {
-    u->~U();
-  }
-#endif
-
-  void deallocate( pointer p, size_type /*num*/ )
-  {
-    internal::aligned_free( p );
-  }
-
-  bool operator!=(const aligned_allocator<T>& ) const
-  { return false; }
-
-  bool operator==(const aligned_allocator<T>& ) const
-  { return true; }
+  aligned_allocator() : std::allocator<T>() {}
+
+  aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
+
+  template<class U>
+  aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
+
+  ~aligned_allocator() {}
+
+  pointer allocate(size_type num, const void* /*hint*/ = 0)
+  {
+    internal::check_size_for_overflow<T>(num);
+    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
+  }
+
+  void deallocate(pointer p, size_type /*num*/)
+  {
+    internal::aligned_free(p);
+  }
 };

 //---------- Cache sizes ----------
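Because the allocator now inherits everything else (address, construct, destroy, max_size, comparisons) from std::allocator, typical STL usage is unchanged; only allocate/deallocate route through Eigen's aligned heap functions. A usage sketch:

    #include <vector>
    #include <Eigen/Core>

    int main()
    {
      // Fixed-size vectorizable Eigen types need the aligned allocator when
      // stored in std containers, otherwise the 16/32-byte guarantee is lost.
      std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > v;
      v.resize(1000);
      v[0] = Eigen::Vector4f::Ones();
      return 0;
    }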
@ -823,9 +767,9 @@ namespace internal {

 #ifdef EIGEN_CPUID

-inline bool cpuid_is_vendor(int abcd[4], const char* vendor)
+inline bool cpuid_is_vendor(int abcd[4], const int vendor[3])
 {
-  return abcd[1]==(reinterpret_cast<const int*>(vendor))[0] && abcd[3]==(reinterpret_cast<const int*>(vendor))[1] && abcd[2]==(reinterpret_cast<const int*>(vendor))[2];
+  return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
 }

 inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
@ -967,13 +911,16 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3)
 {
 #ifdef EIGEN_CPUID
   int abcd[4];
+  const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
+  const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
+  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"

   // identify the CPU vendor
   EIGEN_CPUID(abcd,0x0,0);
   int max_std_funcs = abcd[1];
-  if(cpuid_is_vendor(abcd,"GenuineIntel"))
+  if(cpuid_is_vendor(abcd,GenuineIntel))
     queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
-  else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!"))
+  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
     queryCacheSizes_amd(l1,l2,l3);
   else
     // by default let's use Intel's API
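The integer constants are just the vendor strings re-read as little-endian 32-bit words, which sidesteps the strict-aliasing cast of the old string-based version. A quick check of the packing (assumes a little-endian target, which is the case on x86 where CPUID exists):

    #include <cassert>
    #include <cstring>

    int main()
    {
      const char vendor[12] = {'G','e','n','u','i','n','e','I','n','t','e','l'};
      int words[3];
      std::memcpy(words, vendor, 12);   // well-defined, unlike the old cast
      assert(words[0] == 0x756e6547);   // "Genu" -> EBX
      assert(words[1] == 0x49656e69);   // "ineI" -> EDX
      assert(words[2] == 0x6c65746e);   // "ntel" -> ECX
      return 0;
    }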
@ -101,6 +101,7 @@ template<typename T> struct packet_traits;
 template<typename T> struct unpacket_traits
 {
   typedef T type;
+  typedef T half;
   enum {size=1};
 };
@ -137,7 +138,7 @@ class compute_matrix_flags
       ((Options&DontAlign)==0)
       && (
 #if EIGEN_ALIGN_STATICALLY
-          ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
+          ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0))
 #else
           0
 #endif
@ -275,10 +275,11 @@ template<typename _MatrixType> class EigenSolver
       */
     EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);

+    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-      return m_realSchur.info();
+      return m_info;
     }

     /** \brief Sets the maximum number of iterations allowed. */
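With info() now backed by a dedicated member, callers can distinguish a clean decomposition from one that hit INF/NaN input. A usage sketch:

    #include <iostream>
    #include <Eigen/Eigenvalues>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4,4);
      Eigen::EigenSolver<Eigen::MatrixXd> es(A);   // computes on construction
      if (es.info() == Eigen::Success)
        std::cout << es.eigenvalues().transpose() << "\n";
      else
        std::cout << "decomposition failed (NumericalIssue)\n";
      return 0;
    }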
@ -302,6 +303,7 @@ template<typename _MatrixType> class EigenSolver
     EigenvalueType m_eivalues;
     bool m_isInitialized;
     bool m_eigenvectorsOk;
+    ComputationInfo m_info;
     RealSchur<MatrixType> m_realSchur;
     MatrixType m_matT;
@ -366,12 +368,16 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
 {
   using std::sqrt;
   using std::abs;
+  using std::max;
+  using numext::isfinite;
   eigen_assert(matrix.cols() == matrix.rows());

   // Reduce to real Schur form.
   m_realSchur.compute(matrix, computeEigenvectors);

-  if (m_realSchur.info() == Success)
+  m_info = m_realSchur.info();
+
+  if (m_info == Success)
   {
     m_matT = m_realSchur.matrixT();
     if (computeEigenvectors)
@ -385,14 +391,40 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
       if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0))
       {
         m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
+        if(!isfinite(m_eivalues.coeffRef(i)))
+        {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         ++i;
       }
       else
       {
         Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        Scalar z;
+        // Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        // without overflow
+        {
+          Scalar t0 = m_matT.coeff(i+1, i);
+          Scalar t1 = m_matT.coeff(i, i+1);
+          Scalar maxval = (max)(abs(p),(max)(abs(t0),abs(t1)));
+          t0 /= maxval;
+          t1 /= maxval;
+          Scalar p0 = p/maxval;
+          z = maxval * sqrt(abs(p0 * p0 + t0 * t1));
+        }

         m_eivalues.coeffRef(i)   = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
         m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
+        if(!(isfinite(m_eivalues.coeffRef(i)) && isfinite(m_eivalues.coeffRef(i+1))))
+        {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         i += 2;
       }
     }
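The rescaling works because z = sqrt(|p^2 + t0*t1|) = maxval * sqrt(|(p/maxval)^2 + (t0/maxval)(t1/maxval)|); dividing by the largest magnitude first keeps the intermediate squares representable. A small numeric illustration (values chosen so the naive p*p would overflow a double):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    int main()
    {
      double p = 1e200, t0 = 1e200, t1 = -1e200;  // naive p*p = 1e400 -> inf
      double maxval = std::max(std::fabs(p), std::max(std::fabs(t0), std::fabs(t1)));
      double z = maxval * std::sqrt(std::fabs((p/maxval)*(p/maxval)
                                              + (t0/maxval)*(t1/maxval)));
      std::printf("z = %g\n", z);   // finite (0 here, since p^2 + t0*t1 == 0)
      assert(z == z);               // not NaN, unlike the unscaled formula
      return 0;
    }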
@ -581,7 +613,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
   }
   else
   {
-    eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen
+    eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)"); // this should not happen
   }
 }
@ -77,7 +77,9 @@ public:
     * represents an invalid rotation. */
   template<typename Derived>
   inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
-  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
+  /** Constructs and initialize the angle-axis rotation from a quaternion \a q.
+    * This function implicitly normalizes the quaternion \a q.
+    */
   template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
   /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
   template<typename Derived>
@ -149,29 +151,27 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;

 /** Set \c *this from a \b unit quaternion.
-  * The axis is normalized.
+  * The resulting axis is normalized.
   *
-  * \warning As any other method dealing with quaternion, if the input quaternion
-  *          is not normalized then the result is undefined.
+  * This function implicitly normalizes the quaternion \a q.
   */
 template<typename Scalar>
 template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
-  using std::acos;
-  EIGEN_USING_STD_MATH(min);
-  EIGEN_USING_STD_MATH(max);
-  using std::sqrt;
-  Scalar n2 = q.vec().squaredNorm();
-  if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision())
+  using std::atan2;
+  Scalar n = q.vec().norm();
+  if(n<NumTraits<Scalar>::epsilon())
+    n = q.vec().stableNorm();
+  if (n > Scalar(0))
   {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
+    m_angle = Scalar(2)*atan2(n, q.w());
+    m_axis  = q.vec() / n;
   }
   else
   {
-    m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1)));
-    m_axis = q.vec() / sqrt(n2);
+    m_angle = 0;
+    m_axis << 1, 0, 0;
   }
   return *this;
 }
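atan2(|vec|, w) is well-conditioned everywhere, whereas acos(w) loses roughly half the significant digits near w = +/-1 (small rotations); the ratio |vec|/w is also scale-invariant, which is why non-unit quaternions are handled implicitly. A sketch of the difference for a tiny rotation:

    #include <cmath>
    #include <cstdio>

    int main()
    {
      // Quaternion of a rotation by 1e-8 rad about x: w = cos(a/2), |vec| = sin(a/2)
      double a = 1e-8;
      double w = std::cos(a/2), n = std::sin(a/2);
      double via_acos  = 2*std::acos(w);     // cos(a/2) rounds to 1.0 -> angle 0
      double via_atan2 = 2*std::atan2(n, w); // recovers 1e-8 accurately
      std::printf("acos: %.3e   atan2: %.3e\n", via_acos, via_atan2);
      return 0;
    }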
@ -587,7 +587,7 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
   // which yields a singular value problem
   if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision())
   {
-    c = max<Scalar>(c,-1);
+    c = (max)(c,Scalar(-1));
     Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
     JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
     Vector3 axis = svd.matrixV().col(2);
@ -194,9 +194,9 @@ public:
   /** type of the matrix used to represent the linear part of the transformation */
   typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType;
   /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact)> LinearPart;
+  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart;
   /** type of read reference to the linear part of the transformation */
-  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact)> ConstLinearPart;
+  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart;
   /** type of read/write reference to the affine part of the transformation */
   typedef typename internal::conditional<int(Mode)==int(AffineCompact),
                              MatrixType&,
@ -113,7 +113,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   const Index n = src.cols(); // number of measurements

   // required for demeaning ...
-  const RealScalar one_over_n = 1 / static_cast<RealScalar>(n);
+  const RealScalar one_over_n = RealScalar(1) / static_cast<RealScalar>(n);

   // computation of mean
   const VectorType src_mean = src.rowwise().sum() * one_over_n;
@ -136,16 +136,16 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo

   // Eq. (39)
   VectorType S = VectorType::Ones(m);
-  if (sigma.determinant()<0) S(m-1) = -1;
+  if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1);

   // Eq. (40) and (43)
   const VectorType& d = svd.singularValues();
   Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank;
   if (rank == m-1) {
-    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > 0 ) {
+    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) {
       Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose();
     } else {
-      const Scalar s = S(m-1); S(m-1) = -1;
+      const Scalar s = S(m-1); S(m-1) = Scalar(-1);
       Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
       S(m-1) = s;
     }
@ -156,7 +156,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   if (with_scaling)
   {
     // Eq. (42)
-    const Scalar c = 1/src_var * svd.singularValues().dot(S);
+    const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S);

     // Eq. (41)
     Rt.col(m).head(m) = dst_mean;
@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec
   typedef typename MatrixType::Index Index;
   enum { TFactorSize = MatrixType::ColsAtCompileTime };
   Index nbVecs = vectors.cols();
-  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize> T(nbVecs,nbVecs);
+  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, ColMajor> T(nbVecs,nbVecs);
   make_block_householder_triangular_factor(T, vectors, hCoeffs);

   const TriangularView<const VectorsType, UnitLower>& V(vectors);
@ -65,10 +65,10 @@ class DiagonalPreconditioner
       {
         typename MatType::InnerIterator it(mat,j);
         while(it && it.index()!=j) ++it;
-        if(it && it.index()==j)
+        if(it && it.index()==j && it.value()!=Scalar(0))
           m_invdiag(j) = Scalar(1)/it.value();
         else
-          m_invdiag(j) = 0;
+          m_invdiag(j) = Scalar(1);
       }
       m_isInitialized = true;
       return *this;
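Falling back to Scalar(1) for missing or zero diagonal entries makes the Jacobi preconditioner act as the identity on those rows instead of injecting an infinity into the iterative solver. Applying it then remains a cheap per-coefficient scaling; a minimal sketch, assuming invdiag holds 1/a_jj (or 1):

    #include <Eigen/Dense>

    // Solve step of a Jacobi (diagonal) preconditioner: z = M^{-1} r, M = diag(A).
    Eigen::VectorXd jacobi_apply(const Eigen::VectorXd& invdiag,
                                 const Eigen::VectorXd& r)
    {
      return invdiag.cwiseProduct(r);   // component-wise, O(n)
    }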
@ -29,10 +29,11 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
   *
   * \param MatrixType the type of the matrix of which we are computing the LU decomposition
   *
-  * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A
-  * is decomposed as A = PLUQ where L is unit-lower-triangular, U is upper-triangular, and P and Q
-  * are permutation matrices. This is a rank-revealing LU decomposition. The eigenvalues (diagonal
-  * coefficients) of U are sorted in such a way that any zeros are at the end.
+  * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
+  * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
+  * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU
+  * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any
+  * zeros are at the end.
   *
   * This decomposition provides the generic approach to solving systems of linear equations, computing
   * the rank, invertibility, inverse, kernel, and determinant.
@ -546,8 +547,8 @@ typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant
 }

 /** \returns the matrix represented by the decomposition,
-  * i.e., it returns the product: P^{-1} L U Q^{-1}.
-  * This function is provided for debug purpose. */
+  * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$.
+  * This function is provided for debug purposes. */
 template<typename MatrixType>
 MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const
 {
@ -109,7 +109,7 @@ class NaturalOrdering
   * \class COLAMDOrdering
   *
   * Functor computing the \em column \em approximate \em minimum \em degree ordering
-  * The matrix should be in column-major format
+  * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
   */
 template<typename Index>
 class COLAMDOrdering
@ -118,10 +118,14 @@ class COLAMDOrdering
     typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
     typedef Matrix<Index, Dynamic, 1> IndexVector;

-    /** Compute the permutation vector form a sparse matrix */
+    /** Compute the permutation vector \a perm from the sparse matrix \a mat
+      * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      */
     template <typename MatrixType>
     void operator() (const MatrixType& mat, PermutationType& perm)
     {
+      eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering");
+
       Index m = mat.rows();
       Index n = mat.cols();
       Index nnz = mat.nonZeros();
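A sparse matrix built by insertion typically carries reserve slack, so it must be compressed before the ordering runs. A usage sketch:

    #include <Eigen/OrderingMethods>
    #include <Eigen/SparseCore>

    int main()
    {
      Eigen::SparseMatrix<double> A(100,100);
      A.insert(0,0) = 1.0;      // ... fill ...
      A.makeCompressed();       // required by the assertion above
      Eigen::PermutationMatrix<Eigen::Dynamic,Eigen::Dynamic,int> perm;
      Eigen::COLAMDOrdering<int> ordering;
      ordering(A, perm);        // column fill-reducing permutation
      return 0;
    }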
@ -415,6 +415,7 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
                          JacobiRotation<RealScalar> *j_right)
 {
   using std::sqrt;
+  using std::abs;
   Matrix<RealScalar,2,2> m;
   m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
        numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
@ -428,9 +429,11 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
   }
   else
   {
-    RealScalar u = d / t;
-    rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = rot1.c() * u;
+    RealScalar t2d2 = numext::hypot(t,d);
+    rot1.c() = abs(t)/t2d2;
+    rot1.s() = d/t2d2;
+    if(t<RealScalar(0))
+      rot1.s() = -rot1.s();
   }
   m.applyOnTheLeft(0,1,rot1);
   j_right->makeJacobi(m,0,1);
@ -338,7 +338,10 @@ const Block<const Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::inner
 namespace internal {

 template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
-          bool OuterVector = (BlockCols==1 && XprType::IsRowMajor) || (BlockRows==1 && !XprType::IsRowMajor)>
+          bool OuterVector = (BlockCols==1 && XprType::IsRowMajor)
+                             | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                               // revert to || as soon as not needed anymore.
+                             (BlockRows==1 && !XprType::IsRowMajor)>
 class GenericSparseBlockInnerIteratorImpl;

 }
@ -1205,7 +1205,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
       m_data.value(p) = m_data.value(p-1);
       --p;
     }
-    eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end");
+    eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end");

     m_innerNonZeros[outer]++;
@ -58,6 +58,7 @@ namespace internal {
   * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module
   * OrderingMethods \endlink module for the list of built-in and external ordering methods.
   *
+  * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
   *
   */
 template<typename _MatrixType, typename _OrderingType>
@ -77,10 +78,23 @@ class SparseQR
     SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
     { }

+    /** Construct a QR factorization of the matrix \a mat.
+      *
+      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      *
+      * \sa compute()
+      */
     SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
     {
       compute(mat);
     }

+    /** Computes the QR factorization of the sparse matrix \a mat.
+      *
+      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      *
+      * \sa analyzePattern(), factorize()
+      */
     void compute(const MatrixType& mat)
     {
       analyzePattern(mat);
@ -255,6 +269,8 @@ class SparseQR
 };

 /** \brief Preprocessing step of a QR factorization
+  *
+  * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
   *
   * In this step, the fill-reducing permutation is computed and applied to the columns of A
   * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited.
@ -264,6 +280,7 @@ class SparseQR
 template <typename MatrixType, typename OrderingType>
 void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
 {
+  eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR");
   // Compute the column fill reducing ordering
   OrderingType ord;
   ord(mat, m_perm_c);
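End-to-end, the requirement reads like this; a sketch of a sparse least-squares solve with SparseQR (matrix content elided):

    #include <Eigen/OrderingMethods>
    #include <Eigen/SparseQR>

    int main()
    {
      Eigen::SparseMatrix<double> A(100,50);
      // ... fill A via insert()/setFromTriplets() ...
      A.makeCompressed();   // mandatory before handing A to SparseQR
      Eigen::SparseQR<Eigen::SparseMatrix<double>,
                      Eigen::COLAMDOrdering<int> > qr(A);
      Eigen::VectorXd b = Eigen::VectorXd::Ones(100);
      Eigen::VectorXd x = qr.solve(b);   // min ||Ax - b||
      return (qr.info() == Eigen::Success) ? 0 : 1;
    }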
@ -11,7 +11,7 @@
 #ifndef EIGEN_STDDEQUE_H
 #define EIGEN_STDDEQUE_H

-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"

 // Define the explicit instantiation (e.g. necessary for the Intel compiler)
 #if defined(__INTEL_COMPILER) || defined(__GNUC__)
@ -10,7 +10,7 @@
 #ifndef EIGEN_STDLIST_H
 #define EIGEN_STDLIST_H

-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"

 // Define the explicit instantiation (e.g. necessary for the Intel compiler)
 #if defined(__INTEL_COMPILER) || defined(__GNUC__)
@ -11,7 +11,7 @@
 #ifndef EIGEN_STDVECTOR_H
 #define EIGEN_STDVECTOR_H

-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"

 /**
  * This section contains a convenience MACRO which allows an easy specialization of
@ -141,6 +141,18 @@ tan() const
   return derived();
 }

+/** \returns an expression of the coefficient-wise arc tan of *this.
+  *
+  * Example: \include Cwise_atan.cpp
+  * Output: \verbinclude Cwise_atan.out
+  *
+  * \sa cos(), sin(), tan()
+  */
+inline const CwiseUnaryOp<internal::scalar_atan_op<Scalar>, Derived>
+atan() const
+{
+  return derived();
+}

 /** \returns an expression of the coefficient-wise power of *this to the given exponent.
  *
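Once the method is in place, atan() composes like the other coefficient-wise functions on arrays. A usage sketch:

    #include <iostream>
    #include <Eigen/Core>

    int main()
    {
      Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(5, -1.0, 1.0);
      std::cout << a.atan() << "\n";                  // element-wise arctangent
      std::cout << (a.sin()/a.cos()).atan() << "\n";  // composes with sin/cos
      return 0;
    }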
README.md (new file)
@ -0,0 +1,3 @@
+**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
+
+For more information go to http://eigen.tuxfamily.org/.
@ -2,6 +2,14 @@
 // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
 // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out

+// Compilation options:
+//
+// -DSCALAR=std::complex<double>
+// -DSCALARA=double or -DSCALARB=double
+// -DHAVE_BLAS
+// -DDECOUPLED
+//
+
 #include <iostream>
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>
@ -14,10 +22,18 @@ using namespace Eigen;
 #define SCALAR float
 #endif

+#ifndef SCALARA
+#define SCALARA SCALAR
+#endif
+
+#ifndef SCALARB
+#define SCALARB SCALAR
+#endif
+
 typedef SCALAR Scalar;
 typedef NumTraits<Scalar>::Real RealScalar;
-typedef Matrix<RealScalar,Dynamic,Dynamic> A;
-typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B;
+typedef Matrix<SCALARA,Dynamic,Dynamic> A;
+typedef Matrix<SCALARB,Dynamic,Dynamic> B;
 typedef Matrix<Scalar,Dynamic,Dynamic> C;
 typedef Matrix<RealScalar,Dynamic,Dynamic> M;
@ -129,35 +145,62 @@ int main(int argc, char ** argv)
   int tries = 2;    // number of tries, we keep the best

   int s = 2048;
+  int m = s;
+  int n = s;
+  int p = s;
   int cache_size = -1;

   bool need_help = false;
-  for (int i=1; i<argc; ++i)
+  for (int i=1; i<argc;)
   {
-    if(argv[i][0]=='s')
-      s = atoi(argv[i]+1);
-    else if(argv[i][0]=='c')
-      cache_size = atoi(argv[i]+1);
-    else if(argv[i][0]=='t')
-      tries = atoi(argv[i]+1);
-    else if(argv[i][0]=='p')
-      rep = atoi(argv[i]+1);
+    if(argv[i][0]=='-')
+    {
+      if(argv[i][1]=='s')
+      {
+        ++i;
+        s = atoi(argv[i++]);
+        m = n = p = s;
+        if(argv[i][0]!='-')
+        {
+          n = atoi(argv[i++]);
+          p = atoi(argv[i++]);
+        }
+      }
+      else if(argv[i][1]=='c')
+      {
+        ++i;
+        cache_size = atoi(argv[i++]);
+      }
+      else if(argv[i][1]=='t')
+      {
+        ++i;
+        tries = atoi(argv[i++]);
+      }
+      else if(argv[i][1]=='p')
+      {
+        ++i;
+        rep = atoi(argv[i++]);
+      }
+    }
     else
+    {
       need_help = true;
+      break;
+    }
   }

   if(need_help)
   {
-    std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
+    std::cout << argv[0] << " -s <matrix sizes> -c <cache size> -t <nb tries> -p <nb repeats>\n";
+    std::cout << "   <matrix sizes> : size\n";
+    std::cout << "   <matrix sizes> : rows columns depth\n";
     return 1;
   }

   if(cache_size>0)
     setCpuCacheSizes(cache_size,96*cache_size);

-  int m = s;
-  int n = s;
-  int p = s;
   A a(m,p); a.setRandom();
   B b(p,n); b.setRandom();
   C c(m,n); c.setOnes();
|
|||||||
|
|
||||||
// check the parallel product is correct
|
// check the parallel product is correct
|
||||||
#if defined EIGEN_HAS_OPENMP
|
#if defined EIGEN_HAS_OPENMP
|
||||||
|
Eigen::initParallel();
|
||||||
int procs = omp_get_max_threads();
|
int procs = omp_get_max_threads();
|
||||||
if(procs>1)
|
if(procs>1)
|
||||||
{
|
{
|
||||||
@ -188,11 +232,20 @@ int main(int argc, char ** argv)
|
|||||||
#elif defined HAVE_BLAS
|
#elif defined HAVE_BLAS
|
||||||
blas_gemm(a,b,r);
|
blas_gemm(a,b,r);
|
||||||
c.noalias() += a * b;
|
c.noalias() += a * b;
|
||||||
if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
|
if(!r.isApprox(c)) {
|
||||||
|
std::cout << r - c << "\n";
|
||||||
|
std::cerr << "Warning, your product is crap!\n\n";
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
gemm(a,b,c);
|
if(1.*m*n*p<2000.*2000*2000)
|
||||||
r.noalias() += a.cast<Scalar>() * b.cast<Scalar>();
|
{
|
||||||
if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
|
gemm(a,b,c);
|
||||||
|
r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
|
||||||
|
if(!r.isApprox(c)) {
|
||||||
|
std::cout << r - c << "\n";
|
||||||
|
std::cerr << "Warning, your product is crap!\n\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_BLAS
|
#ifdef HAVE_BLAS
|
||||||
@ -214,7 +267,7 @@ int main(int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
BenchTimer tmono;
|
BenchTimer tmono;
|
||||||
omp_set_num_threads(1);
|
omp_set_num_threads(1);
|
||||||
Eigen::internal::setNbThreads(1);
|
Eigen::setNbThreads(1);
|
||||||
c = rc;
|
c = rc;
|
||||||
BENCH(tmono, tries, rep, gemm(a,b,c));
|
BENCH(tmono, tries, rep, gemm(a,b,c));
|
||||||
std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
|
std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
|
||||||
@ -223,6 +276,15 @@ int main(int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if(1.*m*n*p<30*30*30)
|
||||||
|
{
|
||||||
|
BenchTimer tmt;
|
||||||
|
c = rc;
|
||||||
|
BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
|
||||||
|
std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n";
|
||||||
|
std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef DECOUPLED
|
#ifdef DECOUPLED
|
||||||
if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
|
if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
|
||||||
{
|
{
|
||||||
|
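The GFLOPS figures printed here follow from counting 2*m*n*p floating-point operations per GEMM (one multiply and one add per inner-product term). A tiny helper reproducing the arithmetic:

    #include <cstdio>

    // flops for C += A*B with A (m x p), B (p x n): 2*m*n*p
    double gemm_gflops(int m, int n, int p, int rep, double seconds)
    {
      return (double(m)*n*p*rep*2 / seconds) * 1e-9;
    }

    int main()
    {
      // e.g. one 2048^3 product in 0.5 s is about 34 GFLOPS
      std::printf("%.2f GFLOPS\n", gemm_gflops(2048,2048,2048, 1, 0.5));
      return 0;
    }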
@ -104,6 +104,7 @@ add_subdirectory(libs/mtl4)
 add_subdirectory(libs/blitz)
 add_subdirectory(libs/tvmet)
 add_subdirectory(libs/STL)
+add_subdirectory(libs/blaze)

 add_subdirectory(data)
@ -33,7 +33,7 @@ class Action_axpby {
 public :

   // Ctor
-  Action_axpby( int size ):_size(size),_alpha(0.5),_beta(0.95)
+  Action_axpby( int size ):_alpha(0.5),_beta(0.95),_size(size)
   {
     MESSAGE("Action_axpby Ctor");
@ -35,7 +35,7 @@ public :

   // Ctor
-  Action_axpy( int size ):_size(size),_coef(1.0)
+  Action_axpy( int size ):_coef(1.0),_size(size)
   {
     MESSAGE("Action_axpy Ctor");
@ -4,10 +4,7 @@ if (ATLAS_LIBRARIES)
|
|||||||
endif (ATLAS_LIBRARIES)
|
endif (ATLAS_LIBRARIES)
|
||||||
|
|
||||||
find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
||||||
find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
||||||
|
|
||||||
find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
|
||||||
find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
|
||||||
|
|
||||||
find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
||||||
find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
||||||
@ -22,14 +19,14 @@ find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
|||||||
|
|
||||||
if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
|
if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
|
||||||
|
|
||||||
set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS} ${ATLAS_F77BLAS} ${ATLAS_LIB})
|
set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_LIB})
|
||||||
|
|
||||||
# search the default lapack lib link to it
|
# search the default lapack lib link to it
|
||||||
find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64)
|
find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64)
|
||||||
find_library(ATLAS_REFERENCE_LAPACK NAMES lapack)
|
find_library(ATLAS_REFERENCE_LAPACK NAMES lapack)
|
||||||
if(ATLAS_REFERENCE_LAPACK)
|
# if(ATLAS_REFERENCE_LAPACK)
|
||||||
set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
|
# set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
|
||||||
endif()
|
# endif()
|
||||||
|
|
||||||
endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
|
endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
|
||||||
|
|
||||||
|
31
bench/btl/cmake/FindBLAZE.cmake
Normal file
31
bench/btl/cmake/FindBLAZE.cmake
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# - Try to find eigen2 headers
|
||||||
|
# Once done this will define
|
||||||
|
#
|
||||||
|
# BLAZE_FOUND - system has blaze lib
|
||||||
|
# BLAZE_INCLUDE_DIR - the blaze include directory
|
||||||
|
#
|
||||||
|
# Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||||
|
# Adapted from FindEigen.cmake:
|
||||||
|
# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
|
||||||
|
# Redistribution and use is allowed according to the terms of the BSD license.
|
||||||
|
# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
|
||||||
|
|
||||||
|
if (BLAZE_INCLUDE_DIR)
|
||||||
|
|
||||||
|
# in cache already
|
||||||
|
set(BLAZE_FOUND TRUE)
|
||||||
|
|
||||||
|
else (BLAZE_INCLUDE_DIR)
|
||||||
|
|
||||||
|
find_path(BLAZE_INCLUDE_DIR NAMES blaze/Blaze.h
|
||||||
|
PATHS
|
||||||
|
${INCLUDE_INSTALL_DIR}
|
||||||
|
)
|
||||||
|
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(BLAZE DEFAULT_MSG BLAZE_INCLUDE_DIR)
|
||||||
|
|
||||||
|
mark_as_advanced(BLAZE_INCLUDE_DIR)
|
||||||
|
|
||||||
|
endif(BLAZE_INCLUDE_DIR)
|
||||||
|
|
@ -1,15 +0,0 @@
|
|||||||
|
|
||||||
if (GOTO_LIBRARIES)
|
|
||||||
set(GOTO_FIND_QUIETLY TRUE)
|
|
||||||
endif (GOTO_LIBRARIES)
|
|
||||||
|
|
||||||
find_library(GOTO_LIBRARIES goto PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR})
|
|
||||||
|
|
||||||
if(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
|
|
||||||
set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread -lgfortran")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
include(FindPackageHandleStandardArgs)
|
|
||||||
find_package_handle_standard_args(GOTO DEFAULT_MSG GOTO_LIBRARIES)
|
|
||||||
|
|
||||||
mark_as_advanced(GOTO_LIBRARIES)
|
|
@ -1,25 +0,0 @@
|
|||||||
|
|
||||||
if (GOTO2_LIBRARIES)
|
|
||||||
set(GOTO2_FIND_QUIETLY TRUE)
|
|
||||||
endif (GOTO2_LIBRARIES)
|
|
||||||
#
|
|
||||||
# find_path(GOTO_INCLUDES
|
|
||||||
# NAMES
|
|
||||||
# cblas.h
|
|
||||||
# PATHS
|
|
||||||
# $ENV{GOTODIR}/include
|
|
||||||
# ${INCLUDE_INSTALL_DIR}
|
|
||||||
# )
|
|
||||||
|
|
||||||
find_file(GOTO2_LIBRARIES libgoto2.so PATHS /usr/lib $ENV{GOTO2DIR} ${LIB_INSTALL_DIR})
|
|
||||||
find_library(GOTO2_LIBRARIES goto2 PATHS $ENV{GOTO2DIR} ${LIB_INSTALL_DIR})
|
|
||||||
|
|
||||||
if(GOTO2_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
|
|
||||||
set(GOTO2_LIBRARIES ${GOTO2_LIBRARIES} "-lpthread -lgfortran")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
include(FindPackageHandleStandardArgs)
|
|
||||||
find_package_handle_standard_args(GOTO2 DEFAULT_MSG
|
|
||||||
GOTO2_LIBRARIES)
|
|
||||||
|
|
||||||
mark_as_advanced(GOTO2_LIBRARIES)
|
|
bench/btl/cmake/FindOPENBLAS.cmake (new file, 17 lines)
@@ -0,0 +1,17 @@
+
+if (OPENBLAS_LIBRARIES)
+  set(OPENBLAS_FIND_QUIETLY TRUE)
+endif (OPENBLAS_LIBRARIES)
+
+find_file(OPENBLAS_LIBRARIES libopenblas.so PATHS /usr/lib $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
+find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
+
+if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
+  set(OPENBLAS_LIBRARIES ${OPENBLAS_LIBRARIES} "-lpthread -lgfortran")
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(OPENBLAS DEFAULT_MSG
+                                  OPENBLAS_LIBRARIES)
+
+mark_as_advanced(OPENBLAS_LIBRARIES)
@@ -1,19 +1,19 @@
-aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:3000
+aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:5000
-ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:3000
+ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:5000
-atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:3000
+atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:5000
 axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000
 axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000
-matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:3000
+matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:5000
-matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000
+matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:5000
-trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:3000
+trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:5000
-trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:3000
+trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:5000
-trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:3000
+trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:5000
-cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000
+cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:5000
-complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000
+complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:5000
-partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000
+partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:5000
-tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000
+tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:5000
-hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000
+hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:5000
-symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:3000
+symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:5000
-syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000
+syr2 ; "{/*1.5 symmetric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:5000
-ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:3000
+ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:5000
 rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000
@@ -10,7 +10,7 @@ ublas ; with lines lw 3 lt 1 lc rgbcolor "#00b7ff"
 mtl4 ; with lines lw 3 lt 1 lc rgbcolor "#d18847"
 blitz ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff"
 F77 ; with lines lw 3 lt 3 lc rgbcolor "#e6e64c"
-GOTO ; with lines lw 3 lt 3 lc rgbcolor "#C05600"
-GOTO2 ; with lines lw 3 lt 1 lc rgbcolor "#C05600"
+OPENBLAS ; with lines lw 3 lt 1 lc rgbcolor "#C05600"
 C ; with lines lw 3 lt 3 lc rgbcolor "#e6bd96"
 ACML ; with lines lw 2 lt 3 lc rgbcolor "#e6e64c"
+blaze ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff"
@@ -33,7 +33,7 @@
 // min matrix size for matrix vector product bench
 #define MIN_MV 5
 // max matrix size for matrix vector product bench
-#define MAX_MV 3000
+#define MAX_MV 5000
 // min matrix size for matrix matrix product bench
 #define MIN_MM 5
 // max matrix size for matrix matrix product bench
@@ -176,7 +176,7 @@ public:
     if (_config!=NULL)
     {
       std::vector<BtlString> config = BtlString(_config).split(" \t\n");
-      for (int i = 0; i<config.size(); i++)
+      for (unsigned int i = 0; i<config.size(); i++)
       {
         if (config[i].beginsWith("-a"))
         {
@@ -224,7 +224,7 @@ public:
       return false;

     BtlString name(_name);
-    for (int i=0; i<Instance.m_selectedActionNames.size(); ++i)
+    for (unsigned int i=0; i<Instance.m_selectedActionNames.size(); ++i)
       if (name.contains(Instance.m_selectedActionNames[i]))
         return false;

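
Note on the two hunks above: std::vector::size() returns the unsigned type std::size_t, so comparing it against a plain int loop index draws -Wsign-compare warnings from GCC and Clang; switching the counter to an unsigned type silences them without changing behavior for any realistic container size. A minimal illustration, assuming nothing beyond the standard library:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> v(3, 7);
      // for (int i = 0; i < v.size(); ++i)        // -Wsign-compare: int vs std::size_t
      for (unsigned int i = 0; i < v.size(); ++i)  // same fix as applied in the hunks above
        std::printf("%d\n", v[i]);
      return 0;
    }
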
@@ -30,23 +30,23 @@ double simple_function(int index_i, int index_j)
   return index_i+index_j;
 }

-double pseudo_random(int index)
+double pseudo_random(int /*index*/)
 {
   return std::rand()/double(RAND_MAX);
 }

-double pseudo_random(int index_i, int index_j)
+double pseudo_random(int /*index_i*/, int /*index_j*/)
 {
   return std::rand()/double(RAND_MAX);
 }


-double null_function(int index)
+double null_function(int /*index*/)
 {
   return 0.0;
 }

-double null_function(int index_i, int index_j)
+double null_function(int /*index_i*/, int /*index_j*/)
 {
   return 0.0;
 }
@@ -23,7 +23,7 @@
 #include "size_log.hh"

 template<class Vector>
-void size_lin_log(const int nb_point, const int size_min, const int size_max, Vector & X)
+void size_lin_log(const int nb_point, const int /*size_min*/, const int size_max, Vector & X)
 {
   int ten=10;
   int nine=9;
@@ -18,27 +18,14 @@ if (MKL_FOUND)
 endif (MKL_FOUND)


-find_package(GOTO2)
-if (GOTO2_FOUND)
-  btl_add_bench(btl_goto2 main.cpp)
-  if(BUILD_btl_goto2)
-    target_link_libraries(btl_goto2 ${GOTO_LIBRARIES} )
-    set_target_properties(btl_goto2 PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO2")
-  endif(BUILD_btl_goto2)
-endif (GOTO2_FOUND)
-
-find_package(GOTO)
-if (GOTO_FOUND)
-  if(GOTO2_FOUND)
-    btl_add_bench(btl_goto main.cpp OFF)
-  else()
-    btl_add_bench(btl_goto main.cpp)
-  endif()
-  if(BUILD_btl_goto)
-    target_link_libraries(btl_goto ${GOTO_LIBRARIES} )
-    set_target_properties(btl_goto PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO")
-  endif(BUILD_btl_goto)
-endif (GOTO_FOUND)
+find_package(OPENBLAS)
+if (OPENBLAS_FOUND)
+  btl_add_bench(btl_openblas main.cpp)
+  if(BUILD_btl_openblas)
+    target_link_libraries(btl_openblas ${OPENBLAS_LIBRARIES} )
+    set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS")
+  endif(BUILD_btl_openblas)
+endif (OPENBLAS_FOUND)

 find_package(ACML)
 if (ACML_FOUND)
@@ -75,7 +75,6 @@ public :
   static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     int N2 = N*N;
     BLAS_FUNC(copy)(&N2, X, &intone, C, &intone);
-    char uplo = 'L';
     int info = 0;
     int * ipiv = (int*)alloca(sizeof(int)*N);
     BLAS_FUNC(getrf)(&N, &N, C, &N, ipiv, &info);
@@ -92,7 +91,7 @@ public :
     BLAS_FUNC(trsm)(&right, &lower, &notrans, &nonunit, &N, &N, &fone, L, &N, X, &N);
   }

-  static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N){
+  static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & /*X*/, int N){
     BLAS_FUNC(trmm)(&left, &lower, &notrans,&nonunit, &N,&N,&fone,A,&N,B,&N);
   }

@@ -101,7 +100,6 @@ public :
   static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     int N2 = N*N;
     BLAS_FUNC(copy)(&N2, X, &intone, C, &intone);
-    char uplo = 'L';
     int info = 0;
     int * ipiv = (int*)alloca(sizeof(int)*N);
     int * jpiv = (int*)alloca(sizeof(int)*N);
@@ -134,8 +132,6 @@ public :
   }
   char uplo = 'U';
   int info = 0;
-  int ilo = 1;
-  int ihi = N;
   int bsize = 64;
   int worksize = N*bsize;
   SCALAR* d = new SCALAR[3*N+worksize];
@@ -17,12 +17,12 @@ public:
   typedef real* gene_matrix;
   typedef real* gene_vector;

-  static void free_matrix(gene_matrix & A, int N){
-    delete A;
+  static void free_matrix(gene_matrix & A, int /*N*/){
+    delete[] A;
   }

   static void free_vector(gene_vector & B){
-    delete B;
+    delete[] B;
   }

   static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
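
The delete to delete[] change just above is a genuine bug fix, not a cleanup: in this interface the matrices and vectors are raw arrays obtained from array new, and releasing such a buffer with scalar delete is undefined behavior. The rule in isolation:

    int main() {
      double* a = new double[16]; // array form of new...
      delete[] a;                 // ...must be matched by the array form of delete;
                                  // `delete a;` here would be undefined behavior

      double* s = new double(1.0); // scalar new...
      delete s;                    // ...pairs with scalar delete
      return 0;
    }
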
@@ -56,13 +56,13 @@ int main()
   bench<Action_trmm<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);

-  bench<Action_cholesky<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_partial_lu<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_cholesky<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_partial_lu<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);

 #ifdef HAS_LAPACK
-  bench<Action_lu_decomp<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_hessenberg<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_tridiagonalization<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_lu_decomp<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_hessenberg<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_tridiagonalization<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
 #endif

   //bench<Action_lu_solve<blas_LU_solve_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
@@ -44,9 +44,9 @@ public :
     return "STL";
   }

-  static void free_matrix(gene_matrix & A, int N){}
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/){}

-  static void free_vector(gene_vector & B){}
+  static void free_vector(gene_vector & /*B*/){}

   static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
     A = A_stl;
bench/btl/libs/blaze/CMakeLists.txt (new file, 10 lines)
@@ -0,0 +1,10 @@
+
+find_package(BLAZE)
+find_package(Boost)
+if (BLAZE_FOUND AND Boost_FOUND)
+  include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
+  btl_add_bench(btl_blaze main.cpp)
+  if(BUILD_btl_blaze)
+    target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
+  endif()
+endif ()
bench/btl/libs/blaze/blaze_interface.hh (new file, 140 lines)
@@ -0,0 +1,140 @@
+//=====================================================
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//=====================================================
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+//
+#ifndef BLAZE_INTERFACE_HH
+#define BLAZE_INTERFACE_HH
+
+#include <blaze/Math.h>
+#include <blaze/Blaze.h>
+// using namespace blaze;
+
+#include <vector>
+
+template<class real>
+class blaze_interface {
+
+public :
+
+  typedef real real_type;
+
+  typedef std::vector<real> stl_vector;
+  typedef std::vector<stl_vector> stl_matrix;
+
+  typedef blaze::DynamicMatrix<real,blaze::columnMajor> gene_matrix;
+  typedef blaze::DynamicVector<real> gene_vector;
+
+  static inline std::string name() { return "blaze"; }
+
+  static void free_matrix(gene_matrix & A, int N){
+    return;
+  }
+
+  static void free_vector(gene_vector & B){
+    return;
+  }
+
+  static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
+    A.resize(A_stl[0].size(), A_stl.size());
+
+    for (int j=0; j<A_stl.size(); j++){
+      for (int i=0; i<A_stl[j].size(); i++){
+        A(i,j) = A_stl[j][i];
+      }
+    }
+  }
+
+  static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){
+    B.resize(B_stl.size());
+    for (int i=0; i<B_stl.size(); i++){
+      B[i] = B_stl[i];
+    }
+  }
+
+  static inline void vector_to_stl(gene_vector & B, stl_vector & B_stl){
+    for (int i=0; i<B_stl.size(); i++){
+      B_stl[i] = B[i];
+    }
+  }
+
+  static inline void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
+    int N = A_stl.size();
+    for (int j=0; j<N; j++){
+      A_stl[j].resize(N);
+      for (int i=0; i<N; i++){
+        A_stl[j][i] = A(i,j);
+      }
+    }
+  }
+
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+    X = (A*B);
+  }
+
+  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+    X = A.transpose()*B.transpose();
+  }
+
+  static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
+    X = (A.transpose()*A);
+  }
+
+  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
+    X = (A*A.transpose());
+  }
+
+  static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+    X = (A*B);
+  }
+
+  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+    X = (A.transpose()*B);
+  }
+
+  static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N){
+    Y += coef * X;
+  }
+
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+    Y = a*X + b*Y;
+  }
+
+  // static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
+  //   C = X;
+  //   recursive_cholesky(C);
+  // }
+
+  // static inline void lu_decomp(const gene_matrix & X, gene_matrix & R, int N){
+  //   R = X;
+  //   std::vector<int> ipvt(N);
+  //   lu_factor(R, ipvt);
+  // }
+
+  // static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector & X, int N){
+  //   X = lower_trisolve(L, B);
+  // }
+
+  static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
+    cible = source;
+  }
+
+  static inline void copy_vector(const gene_vector & source, gene_vector & cible, int N){
+    cible = source;
+  }
+
+};
+
+#endif
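
For orientation (a reading of the interface above, not extra API): BTL drives every library through the same set of static functions. matrix_from_stl imports the reference data into the library's native type, and each timed action then calls exactly one operation such as matrix_matrix_product. Stripped of the harness, the blaze calls reduce to ordinary expression-template code, as in this sketch:

    #include <blaze/Math.h>
    #include <iostream>

    int main() {
      typedef blaze::DynamicMatrix<double, blaze::columnMajor> Mat;
      Mat A(3, 3, 1.0), B(3, 3, 2.0), X;  // 3x3 matrices, constant-filled
      X = A * B;                          // what matrix_matrix_product(A, B, X, N) evaluates
      std::cout << X(0, 0) << "\n";       // each entry is 3 * (1*2) = 6
      return 0;
    }
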
bench/btl/libs/blaze/main.cpp (new file, 40 lines)
@@ -0,0 +1,40 @@
+//=====================================================
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//=====================================================
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+//
+#include "utilities.h"
+#include "blaze_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+  bench<Action_axpy<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+  bench<Action_axpby<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+
+  bench<Action_matrix_vector_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+  // bench<Action_atv_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+  // bench<Action_matrix_matrix_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_ata_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_aat_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+
+  return 0;
+}
@@ -45,9 +45,9 @@ public :
     return EIGEN_MAKESTRING(BTL_PREFIX);
   }

-  static void free_matrix(gene_matrix & A, int N) {}
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}

-  static void free_vector(gene_vector & B) {}
+  static void free_vector(gene_vector & /*B*/) {}

   static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
     A.resize(A_stl[0].size(), A_stl.size());
@@ -74,7 +74,7 @@ public :
   }

   static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
     int N=A_stl.size();

     for (int j=0;j<N;j++){
       A_stl[j].resize(N);
@@ -84,28 +84,28 @@ public :
     }
   }

-  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
     X.noalias() = A*B;
   }

-  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
     X.noalias() = A.transpose()*B.transpose();
   }

-  // static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
+  // static inline void ata_product(const gene_matrix & A, gene_matrix & X, int /*N*/){
   //   X.noalias() = A.transpose()*A;
   // }

-  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
+  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int /*N*/){
     X.template triangularView<Lower>().setZero();
     X.template selfadjointView<Lower>().rankUpdate(A);
   }

-  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
+  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
     X.noalias() = A*B;
   }

-  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
+  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
     X.noalias() = (A.template selfadjointView<Lower>() * B);
     // internal::product_selfadjoint_vector<real,0,LowerTriangularBit,false,false>(N,A.data(),N, B.data(), 1, X.data(), 1);
   }
@@ -155,54 +155,54 @@ public :
     }
   }

   static EIGEN_DONT_INLINE void syr2(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){
     // internal::product_selfadjoint_rank2_update<real,0,LowerTriangularBit>(N,A.data(),N, X.data(), 1, Y.data(), 1, -1);
     for(int j=0; j<N; ++j)
       A.col(j).tail(N-j) += X[j] * Y.tail(N-j) + Y[j] * X.tail(N-j);
   }

   static EIGEN_DONT_INLINE void ger(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){
     for(int j=0; j<N; ++j)
       A.col(j) += X * Y[j];
   }

-  static EIGEN_DONT_INLINE void rot(gene_vector & A, gene_vector & B, real c, real s, int N){
+  static EIGEN_DONT_INLINE void rot(gene_vector & A, gene_vector & B, real c, real s, int /*N*/){
     internal::apply_rotation_in_the_plane(A, B, JacobiRotation<real>(c,s));
   }

-  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int /*N*/){
     X.noalias() = (A.transpose()*B);
   }

-  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){
+  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){
     Y += coef * X;
   }

-  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){
     Y = a*X + b*Y;
   }

-  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
+  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){
     cible = source;
   }

-  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int N){
+  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){
     cible = source;
   }

-  static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){
+  static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int /*N*/){
     X = L.template triangularView<Lower>().solve(B);
   }

-  static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
+  static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){
     X = L.template triangularView<Upper>().solve(B);
   }

-  static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
+  static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){
     X.noalias() = L.template triangularView<Lower>() * B;
   }

-  static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void cholesky(const gene_matrix & X, gene_matrix & C, int /*N*/){
     C = X;
     internal::llt_inplace<real,Lower>::blocked(C);
     //C = X.llt().matrixL();
@@ -211,11 +211,11 @@ public :
     // Cholesky<gene_matrix>::computeInPlaceBlock(C);
   }

-  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int /*N*/){
     C = X.fullPivLu().matrixLU();
   }

   static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     Matrix<DenseIndex,1,Dynamic> piv(N);
     DenseIndex nb;
     C = X;
@@ -223,13 +223,13 @@ public :
     // C = X.partialPivLu().matrixLU();
   }

   static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){
     typename Tridiagonalization<gene_matrix>::CoeffVectorType aux(N-1);
     C = X;
     internal::tridiagonalization_inplace(C, aux);
   }

-  static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int /*N*/){
     C = HessenbergDecomposition<gene_matrix>(X).packedMatrix();
   }

@@ -29,14 +29,14 @@ BTL_MAIN;

 int main()
 {
-  bench<Action_trisolve<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_trisolve_matrix<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_cholesky<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_lu_decomp<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_partial_lu<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_trisolve<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_trisolve_matrix<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_cholesky<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  // bench<Action_lu_decomp<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_partial_lu<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);

-  bench<Action_hessenberg<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_tridiagonalization<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_hessenberg<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_tridiagonalization<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);

   return 0;
 }
bench/dense_solvers.cpp (new file, 76 lines)
@@ -0,0 +1,76 @@
+#include <iostream>
+#include "BenchTimer.h"
+#include <Eigen/Dense>
+#include <map>
+#include <string>
+using namespace Eigen;
+
+std::map<std::string,Array<float,1,4> > results;
+
+template<typename Scalar,int Size>
+void bench(int id, int size = Size)
+{
+  typedef Matrix<Scalar,Size,Size> Mat;
+  Mat A(size,size);
+  A.setRandom();
+  A = A*A.adjoint();
+  BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_fpqr, t_jsvd;
+
+  int tries = 3;
+  int rep = 1000/size;
+  if(rep==0) rep = 1;
+  rep = rep*rep;
+
+  LLT<Mat> llt(A);
+  LDLT<Mat> ldlt(A);
+  PartialPivLU<Mat> lu(A);
+  FullPivLU<Mat> fplu(A);
+  HouseholderQR<Mat> qr(A);
+  ColPivHouseholderQR<Mat> cpqr(A);
+  FullPivHouseholderQR<Mat> fpqr(A);
+  JacobiSVD<Mat> jsvd(A.rows(),A.cols());
+
+  BENCH(t_llt, tries, rep, llt.compute(A));
+  BENCH(t_ldlt, tries, rep, ldlt.compute(A));
+  BENCH(t_lu, tries, rep, lu.compute(A));
+  BENCH(t_fplu, tries, rep, fplu.compute(A));
+  BENCH(t_qr, tries, rep, qr.compute(A));
+  BENCH(t_cpqr, tries, rep, cpqr.compute(A));
+  BENCH(t_fpqr, tries, rep, fpqr.compute(A));
+  if(size<500) // JacobiSVD is really too slow for too large matrices
+    BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV));
+
+  results["LLT"][id] = t_llt.best();
+  results["LDLT"][id] = t_ldlt.best();
+  results["PartialPivLU"][id] = t_lu.best();
+  results["FullPivLU"][id] = t_fplu.best();
+  results["HouseholderQR"][id] = t_qr.best();
+  results["ColPivHouseholderQR"][id] = t_cpqr.best();
+  results["FullPivHouseholderQR"][id] = t_fpqr.best();
+  results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0;
+}
+
+int main()
+{
+  const int small = 8;
+  const int medium = 100;
+  const int large = 1000;
+  const int xl = 4000;
+
+  bench<float,small>(0);
+  bench<float,Dynamic>(1,medium);
+  bench<float,Dynamic>(2,large);
+  bench<float,Dynamic>(3,xl);
+
+  IOFormat fmt(3, 0, " \t", "\n", "", "");
+
+  std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n";
+  std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n";
+  std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
+}
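
A note on reading the output of this new benchmark: only the LLT row is an absolute time, while every following row is the ratio of that solver's best time to the LLT best time for the same matrix size. So if LLT prints 2.0 and FullPivLU prints 3.5 in the same column, the full-pivoting LU took 3.5 times as long, about 7.0 in the same units; a value below 1 would mean faster than Cholesky. The JacobiSVD entry is forced to 0 for sizes of 500 and above, where the decomposition is skipped as too slow.
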
@@ -91,4 +91,5 @@ void printBenchStyle(std::ofstream& out)
 </xsl:stylesheet>\n\n";

 }
+
 #endif
@@ -1,9 +1,6 @@

 This directory contains a BLAS library built on top of Eigen.

-This is currently a work in progress which is far to be ready for use,
-but feel free to contribute to it if you wish.
-
 This module is not built by default. In order to compile it, you need to
 type 'make blas' from within your build dir.

@@ -106,13 +106,13 @@ matrix(T* data, int rows, int cols, int stride)
 }

 template<typename T>
-Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > vector(T* data, int size, int incr)
+Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(T* data, int size, int incr)
 {
   return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
 }

 template<typename T>
-Map<Matrix<T,Dynamic,1> > vector(T* data, int size)
+Map<Matrix<T,Dynamic,1> > make_vector(T* data, int size)
 {
   return Map<Matrix<T,Dynamic,1> >(data, size);
 }
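
A plausible reason for this rename (an inference; the patch itself gives no rationale): a free function named vector is one `using namespace std;` away from ambiguity with std::vector, whereas make_vector cannot collide. Functionally these factories just wrap a raw BLAS array in an Eigen::Map view, strided or contiguous. A self-contained sketch of the strided variant shown above (the original sits in the BLAS wrapper's common header):

    #include <Eigen/Core>
    #include <iostream>
    using namespace Eigen;

    // Stand-alone copy of the strided factory shown in the hunk above.
    template<typename T>
    Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(T* data, int size, int incr)
    {
      return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
    }

    int main() {
      double raw[6] = {1, 10, 2, 20, 3, 30};
      // View every second entry as a length-3 vector: prints "1 2 3".
      std::cout << make_vector(raw, 3, 2).transpose() << std::endl;
      return 0;
    }
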
@@ -124,8 +124,8 @@ T* get_compact_vector(T* x, int n, int incx)
     return x;

   T* ret = new Scalar[n];
-  if(incx<0) vector(ret,n) = vector(x,n,-incx).reverse();
-  else       vector(ret,n) = vector(x,n, incx);
+  if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse();
+  else       make_vector(ret,n) = make_vector(x,n, incx);
   return ret;
 }

@@ -135,8 +135,8 @@ T* copy_back(T* x_cpy, T* x, int n, int incx)
   if(x_cpy==x)
     return 0;

-  if(incx<0) vector(x,n,-incx).reverse() = vector(x_cpy,n);
-  else       vector(x,n, incx) = vector(x_cpy,n);
+  if(incx<0) make_vector(x,n,-incx).reverse() = make_vector(x_cpy,n);
+  else       make_vector(x,n, incx) = make_vector(x_cpy,n);
   return x_cpy;
 }

@@ -23,11 +23,10 @@ double BLASFUNC(dsdot)(int* n, float* x, int* incx, float* y, int* incy)
 {
   if(*n<=0) return 0;

-  if(*incx==1 && *incy==1)    return (vector(x,*n).cast<double>().cwiseProduct(vector(y,*n).cast<double>())).sum();
-  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cast<double>().cwiseProduct(vector(y,*n,*incy).cast<double>())).sum();
-  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(vector(y,*n,*incy).cast<double>())).sum();
-  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cast<double>().cwiseProduct(vector(y,*n,-*incy).reverse().cast<double>())).sum();
-  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cast<double>().cwiseProduct(make_vector(y,*n).cast<double>())).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
   else return 0;
 }

@@ -32,13 +32,14 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n,

   if(*n<=0) return 0;

-  if(*incx==1)  return vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
-  else          return vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
+  if(*incx==1)  return make_vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
+  else          return make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
 }

 // computes a dot product of a conjugated vector with another vector.
 int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres)
 {
+  //   std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n";
   Scalar* res = reinterpret_cast<Scalar*>(pres);

   if(*n<=0)
@@ -50,11 +51,11 @@ int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, in
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)    *res = (vector(x,*n).dot(vector(y,*n)));
-  else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).dot(vector(y,*n,*incy)));
-  else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,*incy)));
-  else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).dot(vector(y,*n,-*incy).reverse()));
-  else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,-*incy).reverse()));
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).dot(make_vector(y,*n)));
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,*incy)));
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,*incy)));
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,-*incy).reverse()));
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,-*incy).reverse()));
   return 0;
 }

@@ -72,11 +73,11 @@ int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, in
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)    *res = (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
-  else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
-  else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
   return 0;
 }

@@ -88,9 +89,9 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n,
   Scalar* x = reinterpret_cast<Scalar*>(px);

   if(*incx==1)
-    return vector(x,*n).stableNorm();
+    return make_vector(x,*n).stableNorm();

-  return vector(x,*n,*incx).stableNorm();
+  return make_vector(x,*n,*incx).stableNorm();
 }

 int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
@@ -102,8 +103,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScal
   RealScalar c = *pc;
   RealScalar s = *ps;

-  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
-  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));

   Reverse<StridedVectorType> rvx(vx);
   Reverse<StridedVectorType> rvy(vy);
@@ -125,9 +126,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealSca

   // std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n";

-  if(*incx==1)  vector(x,*n) *= alpha;
-  else          vector(x,*n,std::abs(*incx)) *= alpha;
+  if(*incx==1)  make_vector(x,*n) *= alpha;
+  else          make_vector(x,*n,std::abs(*incx)) *= alpha;

   return 0;
 }
@@ -17,11 +17,11 @@ int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx,

   if(*n<=0) return 0;

-  if(*incx==1 && *incy==1)    vector(y,*n) += alpha * vector(x,*n);
-  else if(*incx>0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,*incx);
-  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,*incx);
-  else if(*incx<0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,-*incx).reverse();
-  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,-*incx).reverse();
+  if(*incx==1 && *incy==1)    make_vector(y,*n) += alpha * make_vector(x,*n);
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,*incx);
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,*incx);
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,-*incx).reverse();
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,-*incx).reverse();

   return 0;
 }
@@ -35,7 +35,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int

   // be careful, *incx==0 is allowed !!
   if(*incx==1 && *incy==1)
-    vector(y,*n) = vector(x,*n);
+    make_vector(y,*n) = make_vector(x,*n);
   else
   {
     if(*incx<0) x = x - (*n-1)*(*incx);
@@ -57,8 +57,8 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *inc
   Scalar* x = reinterpret_cast<Scalar*>(px);

   DenseIndex ret;
-  if(*incx==1) vector(x,*n).cwiseAbs().maxCoeff(&ret);
-  else         vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
+  if(*incx==1) make_vector(x,*n).cwiseAbs().maxCoeff(&ret);
+  else         make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
   return ret+1;
 }

@@ -68,8 +68,8 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amin_)(int *n, RealScalar *px, int *inc
   Scalar* x = reinterpret_cast<Scalar*>(px);

   DenseIndex ret;
-  if(*incx==1) vector(x,*n).cwiseAbs().minCoeff(&ret);
-  else         vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
+  if(*incx==1) make_vector(x,*n).cwiseAbs().minCoeff(&ret);
+  else         make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
   return ret+1;
 }

@@ -143,8 +143,8 @@ int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx)
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);

-  if(*incx==1) vector(x,*n) *= alpha;
-  else         vector(x,*n,std::abs(*incx)) *= alpha;
+  if(*incx==1) make_vector(x,*n) *= alpha;
+  else         make_vector(x,*n,std::abs(*incx)) *= alpha;

   return 0;
 }
@@ -156,12 +156,11 @@ int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)    vector(y,*n).swap(vector(x,*n));
-  else if(*incx>0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,*incx));
-  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,*incx));
-  else if(*incx<0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,-*incx).reverse());
-  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,-*incx).reverse());
+  if(*incx==1 && *incy==1)    make_vector(y,*n).swap(make_vector(x,*n));
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,*incx));
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,*incx));
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,-*incx).reverse());
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,-*incx).reverse());

   return 1;
 }
@@ -19,8 +19,8 @@ RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx)

   if(*n<=0) return 0;

-  if(*incx==1) return vector(x,*n).cwiseAbs().sum();
-  else         return vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
+  if(*incx==1) return make_vector(x,*n).cwiseAbs().sum();
+  else         return make_vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
 }

 // computes a vector-vector dot product.
@@ -33,11 +33,11 @@ Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, i
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)    return (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
-  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
-  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
   else return 0;
 }

@@ -50,8 +50,8 @@ Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx)

   Scalar* x = reinterpret_cast<Scalar*>(px);

-  if(*incx==1) return vector(x,*n).stableNorm();
-  else         return vector(x,*n,std::abs(*incx)).stableNorm();
+  if(*incx==1) return make_vector(x,*n).stableNorm();
+  else         return make_vector(x,*n,std::abs(*incx)).stableNorm();
 }

 int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
@@ -64,8 +64,8 @@ int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int
   Scalar c = *reinterpret_cast<Scalar*>(pc);
   Scalar s = *reinterpret_cast<Scalar*>(ps);

-  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
-  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));

   Reverse<StridedVectorType> rvx(vx);
   Reverse<StridedVectorType> rvy(vy);
@@ -57,8 +57,8 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa

   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_y, *n).setZero();
-    else                vector(actual_y, *n) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
+    else                make_vector(actual_y, *n) *= beta;
   }

   if(alpha!=Scalar(0))
@@ -58,8 +58,8 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca

   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_c, actual_m).setZero();
-    else                vector(actual_c, actual_m) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_c, actual_m).setZero();
+    else                make_vector(actual_c, actual_m) *= beta;
   }

   if(code>=4 || func[code]==0)
@ -232,8 +232,8 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
|
|||||||
|
|
||||||
if(beta!=Scalar(1))
|
if(beta!=Scalar(1))
|
||||||
{
|
{
|
||||||
if(beta==Scalar(0)) vector(actual_y, actual_m).setZero();
|
if(beta==Scalar(0)) make_vector(actual_y, actual_m).setZero();
|
||||||
else vector(actual_y, actual_m) *= beta;
|
else make_vector(actual_y, actual_m) *= beta;
|
||||||
}
|
}
|
||||||
|
|
||||||
MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
|
MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
|
||||||
@ -246,11 +246,11 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
|
|||||||
int len = end - start + 1;
|
int len = end - start + 1;
|
||||||
int offset = (*ku) - j + start;
|
int offset = (*ku) - j + start;
|
||||||
if(OP(*trans)==NOTR)
|
if(OP(*trans)==NOTR)
|
||||||
vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
|
make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
|
||||||
else if(OP(*trans)==TR)
|
else if(OP(*trans)==TR)
|
||||||
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value();
|
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
|
||||||
else
|
else
|
||||||
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * vector(actual_x+start,len) ).value();
|
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
|
||||||
}
|
}
|
||||||
|
|
||||||
if(actual_x!=x) delete[] actual_x;
|
if(actual_x!=x) delete[] actual_x;
|
||||||
@@ -304,11 +304,11 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea
     int offset = (ku) - j + start;
 
     if(OP(*trans)==NOTR)
-      vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
+      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
     else if(OP(*trans)==TR)
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
     else
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
   }
 
   if(actual_x!=x) delete[] actual_x;
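The three branches map the BLAS trans argument onto Eigen expressions: 'N' uses the stored column segment directly, 'T' a plain transpose, and the remaining case ('C', conjugate transpose) an adjoint, which differs from transpose only for complex Scalar. A tiny illustration of that difference:

    #include <Eigen/Core>
    #include <complex>
    #include <iostream>

    int main()
    {
      Eigen::Vector2cd a(std::complex<double>(1,2), std::complex<double>(3,4));
      Eigen::Vector2cd b(1.0, 1.0);
      // transpose: no conjugation -> (1+2i) + (3+4i) = 4+6i
      std::cout << (a.transpose() * b).value() << "\n";
      // adjoint: conjugates a     -> (1-2i) + (3-4i) = 4-6i
      std::cout << (a.adjoint() * b).value() << "\n";
    }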
@@ -521,4 +521,3 @@ int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
 
   return 1;
 }
-
@@ -51,8 +51,8 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
 
   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_y, *n).setZero();
-    else vector(actual_y, *n) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
+    else make_vector(actual_y, *n) *= beta;
   }
 
   int code = UPLO(*uplo);
@@ -366,5 +366,3 @@ int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx,
 
   return 1;
 }
-
-
@@ -1,7 +1,7 @@
 
 #include <iostream>
 
-#if (defined __GNUC__)
+#if (defined __GNUC__) && (!defined __MINGW32__)
 #define EIGEN_WEAK_LINKING __attribute__ ((weak))
 #else
 #define EIGEN_WEAK_LINKING
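`__attribute__((weak))` marks a definition that the linker discards whenever a strong definition of the same symbol exists, so an application can supply its own replacement for the library default. MinGW is excluded here presumably because weak symbols are unreliable in its PE/COFF toolchain; that rationale is an inference, not stated in the diff. A sketch of the override mechanism, using a made-up function name:

    #include <iostream>

    // Library default: weak, so any strong definition elsewhere wins at
    // link time. 'report_error' is hypothetical, for illustration only.
    EIGEN_WEAK_LINKING int report_error(int code)
    {
      std::cerr << "default handler, error " << code << std::endl;
      return 0;
    }
    // An application that defines its own non-weak report_error(int) in
    // another object file silently replaces this default.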
@@ -3,26 +3,26 @@
 # Eigen3Config.cmake(.in)
 
 # Use the following variables to compile and link against Eigen:
-# EIGEN_FOUND - True if Eigen was found on your system
-# EIGEN_USE_FILE - The file making Eigen usable
-# EIGEN_DEFINITIONS - Definitions needed to build with Eigen
-# EIGEN_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found
-# EIGEN_INCLUDE_DIRS - List of directories of Eigen and it's dependencies
-# EIGEN_ROOT_DIR - The base directory of Eigen
-# EIGEN_VERSION_STRING - A human-readable string containing the version
-# EIGEN_VERSION_MAJOR - The major version of Eigen
-# EIGEN_VERSION_MINOR - The minor version of Eigen
-# EIGEN_VERSION_PATCH - The patch version of Eigen
+# EIGEN3_FOUND - True if Eigen was found on your system
+# EIGEN3_USE_FILE - The file making Eigen usable
+# EIGEN3_DEFINITIONS - Definitions needed to build with Eigen
+# EIGEN3_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found
+# EIGEN3_INCLUDE_DIRS - List of directories of Eigen and it's dependencies
+# EIGEN3_ROOT_DIR - The base directory of Eigen
+# EIGEN3_VERSION_STRING - A human-readable string containing the version
+# EIGEN3_VERSION_MAJOR - The major version of Eigen
+# EIGEN3_VERSION_MINOR - The minor version of Eigen
+# EIGEN3_VERSION_PATCH - The patch version of Eigen
 
-set ( EIGEN_FOUND 1 )
-set ( EIGEN_USE_FILE "@EIGEN_USE_FILE@" )
+set ( EIGEN3_FOUND 1 )
+set ( EIGEN3_USE_FILE "@EIGEN_USE_FILE@" )
 
-set ( EIGEN_DEFINITIONS "@EIGEN_DEFINITIONS@" )
-set ( EIGEN_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" )
-set ( EIGEN_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" )
-set ( EIGEN_ROOT_DIR "@EIGEN_ROOT_DIR@" )
+set ( EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@" )
+set ( EIGEN3_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" )
+set ( EIGEN3_ROOT_DIR "@EIGEN_ROOT_DIR@" )
 
-set ( EIGEN_VERSION_STRING "@EIGEN_VERSION_STRING@" )
-set ( EIGEN_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" )
-set ( EIGEN_VERSION_MINOR "@EIGEN_VERSION_MINOR@" )
-set ( EIGEN_VERSION_PATCH "@EIGEN_VERSION_PATCH@" )
+set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" )
+set ( EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" )
+set ( EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@" )
+set ( EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@" )
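With these renames the generated config file exports the same EIGEN3_-prefixed variables as the long-standing FindEigen3.cmake module, so both discovery paths present a uniform interface. A sketch of typical downstream usage, in CMake since that is the language of the file being patched (project and target names are placeholders):

    find_package( Eigen3 REQUIRED )
    include_directories( ${EIGEN3_INCLUDE_DIR} )
    add_definitions( ${EIGEN3_DEFINITIONS} )
    add_executable( demo main.cpp )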
Some files were not shown because too many files have changed in this diff.