mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
Merged eigen/eigen into default
This commit is contained in:
commit
1affe3d8df
@ -9,6 +9,7 @@
|
|||||||
#define EIGEN_CHOLESKY_MODULE_H
|
#define EIGEN_CHOLESKY_MODULE_H
|
||||||
|
|
||||||
#include "Core"
|
#include "Core"
|
||||||
|
#include "Jacobi"
|
||||||
|
|
||||||
#include "src/Core/util/DisableStupidWarnings.h"
|
#include "src/Core/util/DisableStupidWarnings.h"
|
||||||
|
|
||||||
@ -31,7 +32,11 @@
|
|||||||
#include "src/Cholesky/LLT.h"
|
#include "src/Cholesky/LLT.h"
|
||||||
#include "src/Cholesky/LDLT.h"
|
#include "src/Cholesky/LDLT.h"
|
||||||
#ifdef EIGEN_USE_LAPACKE
|
#ifdef EIGEN_USE_LAPACKE
|
||||||
|
#ifdef EIGEN_USE_MKL
|
||||||
|
#include "mkl_lapacke.h"
|
||||||
|
#else
|
||||||
#include "src/misc/lapacke.h"
|
#include "src/misc/lapacke.h"
|
||||||
|
#endif
|
||||||
#include "src/Cholesky/LLT_LAPACKE.h"
|
#include "src/Cholesky/LLT_LAPACKE.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
32
Eigen/Core
32
Eigen/Core
@ -14,8 +14,24 @@
|
|||||||
// first thing Eigen does: stop the compiler from committing suicide
|
// first thing Eigen does: stop the compiler from committing suicide
|
||||||
#include "src/Core/util/DisableStupidWarnings.h"
|
#include "src/Core/util/DisableStupidWarnings.h"
|
||||||
|
|
||||||
|
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
|
||||||
|
#define EIGEN_CUDACC __CUDACC__
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
|
||||||
|
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
|
||||||
|
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
|
||||||
|
#elif defined(__CUDACC_VER__)
|
||||||
|
#define EIGEN_CUDACC_VER __CUDACC_VER__
|
||||||
|
#else
|
||||||
|
#define EIGEN_CUDACC_VER 0
|
||||||
|
#endif
|
||||||
|
|
||||||
// Handle NVCC/CUDA/SYCL
|
// Handle NVCC/CUDA/SYCL
|
||||||
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
|
#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
|
||||||
// Do not try asserts on CUDA and SYCL!
|
// Do not try asserts on CUDA and SYCL!
|
||||||
#ifndef EIGEN_NO_DEBUG
|
#ifndef EIGEN_NO_DEBUG
|
||||||
#define EIGEN_NO_DEBUG
|
#define EIGEN_NO_DEBUG
|
||||||
@ -30,7 +46,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// All functions callable from CUDA code must be qualified with __device__
|
// All functions callable from CUDA code must be qualified with __device__
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
// Do not try to vectorize on CUDA and SYCL!
|
// Do not try to vectorize on CUDA and SYCL!
|
||||||
#ifndef EIGEN_DONT_VECTORIZE
|
#ifndef EIGEN_DONT_VECTORIZE
|
||||||
#define EIGEN_DONT_VECTORIZE
|
#define EIGEN_DONT_VECTORIZE
|
||||||
@ -47,16 +63,20 @@
|
|||||||
#define EIGEN_DEVICE_FUNC
|
#define EIGEN_DEVICE_FUNC
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __NVCC__
|
||||||
|
#define EIGEN_DONT_VECTORIZE
|
||||||
|
#endif
|
||||||
|
|
||||||
// When compiling CUDA device code with NVCC, pull in math functions from the
|
// When compiling CUDA device code with NVCC, pull in math functions from the
|
||||||
// global namespace. In host mode, and when device doee with clang, use the
|
// global namespace. In host mode, and when device doee with clang, use the
|
||||||
// std versions.
|
// std versions.
|
||||||
#if defined(__CUDA_ARCH__) && defined(__NVCC__)
|
#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
|
||||||
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
|
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
|
||||||
#else
|
#else
|
||||||
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
|
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
|
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
|
||||||
#define EIGEN_EXCEPTIONS
|
#define EIGEN_EXCEPTIONS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -233,10 +253,10 @@
|
|||||||
#define EIGEN_HAS_FP16_C
|
#define EIGEN_HAS_FP16_C
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined __CUDACC__
|
#if defined EIGEN_CUDACC
|
||||||
#define EIGEN_VECTORIZE_CUDA
|
#define EIGEN_VECTORIZE_CUDA
|
||||||
#include <vector_types.h>
|
#include <vector_types.h>
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#define EIGEN_HAS_CUDA_FP16
|
#define EIGEN_HAS_CUDA_FP16
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
@ -45,7 +45,11 @@
|
|||||||
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
|
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
|
||||||
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
|
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
|
||||||
#ifdef EIGEN_USE_LAPACKE
|
#ifdef EIGEN_USE_LAPACKE
|
||||||
|
#ifdef EIGEN_USE_MKL
|
||||||
|
#include "mkl_lapacke.h"
|
||||||
|
#else
|
||||||
#include "src/misc/lapacke.h"
|
#include "src/misc/lapacke.h"
|
||||||
|
#endif
|
||||||
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
|
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
|
||||||
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
|
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
|
||||||
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
|
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
|
||||||
|
4
Eigen/LU
4
Eigen/LU
@ -28,7 +28,11 @@
|
|||||||
#include "src/LU/FullPivLU.h"
|
#include "src/LU/FullPivLU.h"
|
||||||
#include "src/LU/PartialPivLU.h"
|
#include "src/LU/PartialPivLU.h"
|
||||||
#ifdef EIGEN_USE_LAPACKE
|
#ifdef EIGEN_USE_LAPACKE
|
||||||
|
#ifdef EIGEN_USE_MKL
|
||||||
|
#include "mkl_lapacke.h"
|
||||||
|
#else
|
||||||
#include "src/misc/lapacke.h"
|
#include "src/misc/lapacke.h"
|
||||||
|
#endif
|
||||||
#include "src/LU/PartialPivLU_LAPACKE.h"
|
#include "src/LU/PartialPivLU_LAPACKE.h"
|
||||||
#endif
|
#endif
|
||||||
#include "src/LU/Determinant.h"
|
#include "src/LU/Determinant.h"
|
||||||
|
4
Eigen/QR
4
Eigen/QR
@ -36,7 +36,11 @@
|
|||||||
#include "src/QR/ColPivHouseholderQR.h"
|
#include "src/QR/ColPivHouseholderQR.h"
|
||||||
#include "src/QR/CompleteOrthogonalDecomposition.h"
|
#include "src/QR/CompleteOrthogonalDecomposition.h"
|
||||||
#ifdef EIGEN_USE_LAPACKE
|
#ifdef EIGEN_USE_LAPACKE
|
||||||
|
#ifdef EIGEN_USE_MKL
|
||||||
|
#include "mkl_lapacke.h"
|
||||||
|
#else
|
||||||
#include "src/misc/lapacke.h"
|
#include "src/misc/lapacke.h"
|
||||||
|
#endif
|
||||||
#include "src/QR/HouseholderQR_LAPACKE.h"
|
#include "src/QR/HouseholderQR_LAPACKE.h"
|
||||||
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
|
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
|
||||||
#endif
|
#endif
|
||||||
|
@ -37,7 +37,11 @@
|
|||||||
#include "src/SVD/JacobiSVD.h"
|
#include "src/SVD/JacobiSVD.h"
|
||||||
#include "src/SVD/BDCSVD.h"
|
#include "src/SVD/BDCSVD.h"
|
||||||
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
|
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
|
||||||
|
#ifdef EIGEN_USE_MKL
|
||||||
|
#include "mkl_lapacke.h"
|
||||||
|
#else
|
||||||
#include "src/misc/lapacke.h"
|
#include "src/misc/lapacke.h"
|
||||||
|
#endif
|
||||||
#include "src/SVD/JacobiSVD_LAPACKE.h"
|
#include "src/SVD/JacobiSVD_LAPACKE.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -248,7 +248,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
|
|||||||
/** \brief Reports whether previous computation was successful.
|
/** \brief Reports whether previous computation was successful.
|
||||||
*
|
*
|
||||||
* \returns \c Success if computation was succesful,
|
* \returns \c Success if computation was succesful,
|
||||||
* \c NumericalIssue if the matrix.appears to be negative.
|
* \c NumericalIssue if the factorization failed because of a zero pivot.
|
||||||
*/
|
*/
|
||||||
ComputationInfo info() const
|
ComputationInfo info() const
|
||||||
{
|
{
|
||||||
|
@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
|
|||||||
* Example: \include LLT_example.cpp
|
* Example: \include LLT_example.cpp
|
||||||
* Output: \verbinclude LLT_example.out
|
* Output: \verbinclude LLT_example.out
|
||||||
*
|
*
|
||||||
|
* \b Performance: for best performance, it is recommended to use a column-major storage format
|
||||||
|
* with the Lower triangular part (the default), or, equivalently, a row-major storage format
|
||||||
|
* with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
|
||||||
|
* step, and rank-updates can be up to 3 times slower.
|
||||||
|
*
|
||||||
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
|
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
|
||||||
*
|
*
|
||||||
|
* Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
|
||||||
|
* Therefore, the strict lower part does not have to store correct values.
|
||||||
|
*
|
||||||
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
|
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
|
||||||
*/
|
*/
|
||||||
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
|
|
||||||
* Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
|
|
||||||
* the strict lower part does not have to store correct values.
|
|
||||||
*/
|
|
||||||
template<typename _MatrixType, int _UpLo> class LLT
|
template<typename _MatrixType, int _UpLo> class LLT
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename Derived>
|
template<typename Derived>
|
||||||
void solveInPlace(MatrixBase<Derived> &bAndX) const;
|
void solveInPlace(const MatrixBase<Derived> &bAndX) const;
|
||||||
|
|
||||||
template<typename InputType>
|
template<typename InputType>
|
||||||
LLT& compute(const EigenBase<InputType>& matrix);
|
LLT& compute(const EigenBase<InputType>& matrix);
|
||||||
@ -177,7 +181,7 @@ template<typename _MatrixType, int _UpLo> class LLT
|
|||||||
/** \brief Reports whether previous computation was successful.
|
/** \brief Reports whether previous computation was successful.
|
||||||
*
|
*
|
||||||
* \returns \c Success if computation was succesful,
|
* \returns \c Success if computation was succesful,
|
||||||
* \c NumericalIssue if the matrix.appears to be negative.
|
* \c NumericalIssue if the matrix.appears not to be positive definite.
|
||||||
*/
|
*/
|
||||||
ComputationInfo info() const
|
ComputationInfo info() const
|
||||||
{
|
{
|
||||||
@ -424,6 +428,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
|
|||||||
eigen_assert(a.rows()==a.cols());
|
eigen_assert(a.rows()==a.cols());
|
||||||
const Index size = a.rows();
|
const Index size = a.rows();
|
||||||
m_matrix.resize(size, size);
|
m_matrix.resize(size, size);
|
||||||
|
if (!internal::is_same_dense(m_matrix, a.derived()))
|
||||||
m_matrix = a.derived();
|
m_matrix = a.derived();
|
||||||
|
|
||||||
// Compute matrix L1 norm = max abs column sum.
|
// Compute matrix L1 norm = max abs column sum.
|
||||||
@ -484,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
|||||||
*
|
*
|
||||||
* This version avoids a copy when the right hand side matrix b is not needed anymore.
|
* This version avoids a copy when the right hand side matrix b is not needed anymore.
|
||||||
*
|
*
|
||||||
|
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
|
||||||
|
* This function will const_cast it, so constness isn't honored here.
|
||||||
|
*
|
||||||
* \sa LLT::solve(), MatrixBase::llt()
|
* \sa LLT::solve(), MatrixBase::llt()
|
||||||
*/
|
*/
|
||||||
template<typename MatrixType, int _UpLo>
|
template<typename MatrixType, int _UpLo>
|
||||||
template<typename Derived>
|
template<typename Derived>
|
||||||
void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
|
void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
|
||||||
{
|
{
|
||||||
eigen_assert(m_isInitialized && "LLT is not initialized.");
|
eigen_assert(m_isInitialized && "LLT is not initialized.");
|
||||||
eigen_assert(m_matrix.rows()==bAndX.rows());
|
eigen_assert(m_matrix.rows()==bAndX.rows());
|
||||||
|
@ -861,6 +861,42 @@ template<typename Derived>
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
|
||||||
{ return Derived::Unit(3); }
|
{ return Derived::Unit(3); }
|
||||||
|
|
||||||
|
/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
|
||||||
|
*
|
||||||
|
* \param i index of the unique coefficient to be set to 1
|
||||||
|
*
|
||||||
|
* \only_for_vectors
|
||||||
|
*
|
||||||
|
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
|
||||||
|
*/
|
||||||
|
template<typename Derived>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
|
||||||
|
{
|
||||||
|
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||||
|
eigen_assert(i<size());
|
||||||
|
derived().setZero();
|
||||||
|
derived().coeffRef(i) = Scalar(1);
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
|
||||||
|
*
|
||||||
|
* \param newSize the new size of the vector
|
||||||
|
* \param i index of the unique coefficient to be set to 1
|
||||||
|
*
|
||||||
|
* \only_for_vectors
|
||||||
|
*
|
||||||
|
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
|
||||||
|
*/
|
||||||
|
template<typename Derived>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
|
||||||
|
{
|
||||||
|
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||||
|
eigen_assert(i<newSize);
|
||||||
|
derived().resize(newSize);
|
||||||
|
return setUnit(i);
|
||||||
|
}
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
#endif // EIGEN_CWISE_NULLARY_OP_H
|
#endif // EIGEN_CWISE_NULLARY_OP_H
|
||||||
|
@ -18,15 +18,30 @@ enum {
|
|||||||
Small = 3
|
Small = 3
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Define the threshold value to fallback from the generic matrix-matrix product
|
||||||
|
// implementation (heavy) to the lightweight coeff-based product one.
|
||||||
|
// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||||
|
// in products/GeneralMatrixMatrix.h for more details.
|
||||||
|
// TODO This threshold should also be used in the compile-time selector below.
|
||||||
|
#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
|
||||||
|
// This default value has been obtained on a Haswell architecture.
|
||||||
|
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
template<int Rows, int Cols, int Depth> struct product_type_selector;
|
template<int Rows, int Cols, int Depth> struct product_type_selector;
|
||||||
|
|
||||||
template<int Size, int MaxSize> struct product_size_category
|
template<int Size, int MaxSize> struct product_size_category
|
||||||
{
|
{
|
||||||
enum { is_large = MaxSize == Dynamic ||
|
enum {
|
||||||
|
#ifndef EIGEN_CUDA_ARCH
|
||||||
|
is_large = MaxSize == Dynamic ||
|
||||||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
||||||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
||||||
|
#else
|
||||||
|
is_large = 0,
|
||||||
|
#endif
|
||||||
value = is_large ? Large
|
value = is_large ? Large
|
||||||
: Size == 1 ? 1
|
: Size == 1 ? 1
|
||||||
: Small
|
: Small
|
||||||
@ -379,8 +394,6 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
|
|||||||
*
|
*
|
||||||
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
|
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
|
||||||
*/
|
*/
|
||||||
#ifndef __CUDACC__
|
|
||||||
|
|
||||||
template<typename Derived>
|
template<typename Derived>
|
||||||
template<typename OtherDerived>
|
template<typename OtherDerived>
|
||||||
inline const Product<Derived, OtherDerived>
|
inline const Product<Derived, OtherDerived>
|
||||||
@ -412,8 +425,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
|||||||
return Product<Derived, OtherDerived>(derived(), other.derived());
|
return Product<Derived, OtherDerived>(derived(), other.derived());
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // __CUDACC__
|
|
||||||
|
|
||||||
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
|
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
|
||||||
*
|
*
|
||||||
* The returned product will behave like any other expressions: the coefficients of the product will be
|
* The returned product will behave like any other expressions: the coefficients of the product will be
|
||||||
|
@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
|
|||||||
/** \internal tries to do cache prefetching of \a addr */
|
/** \internal tries to do cache prefetching of \a addr */
|
||||||
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
#if defined(__LP64__)
|
#if defined(__LP64__)
|
||||||
// 64-bit pointer operand constraint for inlined asm
|
// 64-bit pointer operand constraint for inlined asm
|
||||||
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
||||||
@ -526,7 +526,7 @@ inline void palign(PacketType& first, const PacketType& second)
|
|||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
// Eigen+CUDA does not support complexes.
|
// Eigen+CUDA does not support complexes.
|
||||||
#ifndef __CUDACC__
|
#ifndef EIGEN_CUDACC
|
||||||
|
|
||||||
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
||||||
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||||
|
@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
|
|||||||
{
|
{
|
||||||
typedef traits<PlainObjectType> TraitsBase;
|
typedef traits<PlainObjectType> TraitsBase;
|
||||||
enum {
|
enum {
|
||||||
|
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
|
||||||
|
? PlainObjectType::ColsAtCompileTime
|
||||||
|
: PlainObjectType::RowsAtCompileTime,
|
||||||
|
|
||||||
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
|
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
|
||||||
? int(PlainObjectType::InnerStrideAtCompileTime)
|
? int(PlainObjectType::InnerStrideAtCompileTime)
|
||||||
: int(StrideType::InnerStrideAtCompileTime),
|
: int(StrideType::InnerStrideAtCompileTime),
|
||||||
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
|
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
|
||||||
? int(PlainObjectType::OuterStrideAtCompileTime)
|
? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
|
||||||
|
? Dynamic
|
||||||
|
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
|
||||||
: int(StrideType::OuterStrideAtCompileTime),
|
: int(StrideType::OuterStrideAtCompileTime),
|
||||||
Alignment = int(MapOptions)&int(AlignedMask),
|
Alignment = int(MapOptions)&int(AlignedMask),
|
||||||
Flags0 = TraitsBase::Flags & (~NestByRefBit),
|
Flags0 = TraitsBase::Flags & (~NestByRefBit),
|
||||||
@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
|
|||||||
inline Index outerStride() const
|
inline Index outerStride() const
|
||||||
{
|
{
|
||||||
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
|
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
|
||||||
: IsVectorAtCompileTime ? this->size()
|
: internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? internal::traits<Map>::OuterStrideAtCompileTime
|
||||||
: int(Flags)&RowMajorBit ? this->cols()
|
: IsVectorAtCompileTime ? (this->size() * innerStride())
|
||||||
: this->rows();
|
: int(Flags)&RowMajorBit ? (this->cols() * innerStride())
|
||||||
|
: (this->rows() * innerStride());
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Constructor in the fixed-size case.
|
/** Constructor in the fixed-size case.
|
||||||
|
@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
|
|||||||
|
|
||||||
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
|
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
|
||||||
|
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
template<typename T>
|
template<typename T>
|
||||||
struct real_impl<std::complex<T> >
|
struct real_impl<std::complex<T> >
|
||||||
{
|
{
|
||||||
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
|
|||||||
|
|
||||||
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
|
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
|
||||||
|
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
template<typename T>
|
template<typename T>
|
||||||
struct imag_impl<std::complex<T> >
|
struct imag_impl<std::complex<T> >
|
||||||
{
|
{
|
||||||
@ -778,7 +778,7 @@ EIGEN_DEVICE_FUNC
|
|||||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||||
isfinite_impl(const T& x)
|
isfinite_impl(const T& x)
|
||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
return (::isfinite)(x);
|
return (::isfinite)(x);
|
||||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||||
using std::isfinite;
|
using std::isfinite;
|
||||||
@ -793,7 +793,7 @@ EIGEN_DEVICE_FUNC
|
|||||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||||
isinf_impl(const T& x)
|
isinf_impl(const T& x)
|
||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
return (::isinf)(x);
|
return (::isinf)(x);
|
||||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||||
using std::isinf;
|
using std::isinf;
|
||||||
@ -808,7 +808,7 @@ EIGEN_DEVICE_FUNC
|
|||||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||||
isnan_impl(const T& x)
|
isnan_impl(const T& x)
|
||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
return (::isnan)(x);
|
return (::isnan)(x);
|
||||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||||
using std::isnan;
|
using std::isnan;
|
||||||
@ -874,7 +874,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
|
|||||||
|
|
||||||
namespace numext {
|
namespace numext {
|
||||||
|
|
||||||
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||||
template<typename T>
|
template<typename T>
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
||||||
@ -1088,7 +1088,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
|
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float log1p(const float &x) { return ::log1pf(x); }
|
float log1p(const float &x) { return ::log1pf(x); }
|
||||||
|
|
||||||
@ -1146,7 +1146,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
|
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float floor(const float &x) { return ::floorf(x); }
|
float floor(const float &x) { return ::floorf(x); }
|
||||||
|
|
||||||
@ -1167,7 +1167,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
|
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float ceil(const float &x) { return ::ceilf(x); }
|
float ceil(const float &x) { return ::ceilf(x); }
|
||||||
|
|
||||||
@ -1225,7 +1225,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
|
|||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float log(const float &x) { return ::logf(x); }
|
float log(const float &x) { return ::logf(x); }
|
||||||
|
|
||||||
@ -1253,7 +1253,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float abs(const float &x) { return ::fabsf(x); }
|
float abs(const float &x) { return ::fabsf(x); }
|
||||||
|
|
||||||
@ -1283,7 +1283,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
|
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float exp(const float &x) { return ::expf(x); }
|
float exp(const float &x) { return ::expf(x); }
|
||||||
|
|
||||||
@ -1303,7 +1303,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
|
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float expm1(const float &x) { return ::expm1f(x); }
|
float expm1(const float &x) { return ::expm1f(x); }
|
||||||
|
|
||||||
@ -1323,7 +1323,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
|
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float cos(const float &x) { return ::cosf(x); }
|
float cos(const float &x) { return ::cosf(x); }
|
||||||
|
|
||||||
@ -1343,7 +1343,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
|
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float sin(const float &x) { return ::sinf(x); }
|
float sin(const float &x) { return ::sinf(x); }
|
||||||
|
|
||||||
@ -1363,7 +1363,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
|
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float tan(const float &x) { return ::tanf(x); }
|
float tan(const float &x) { return ::tanf(x); }
|
||||||
|
|
||||||
@ -1378,13 +1378,14 @@ T acos(const T &x) {
|
|||||||
return acos(x);
|
return acos(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if EIGEN_HAS_CXX11_MATH
|
||||||
template<typename T>
|
template<typename T>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
T acosh(const T &x) {
|
T acosh(const T &x) {
|
||||||
EIGEN_USING_STD_MATH(acosh);
|
EIGEN_USING_STD_MATH(acosh);
|
||||||
return acosh(x);
|
return acosh(x);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__SYCL_DEVICE_ONLY__)
|
#if defined(__SYCL_DEVICE_ONLY__)
|
||||||
EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); }
|
EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); }
|
||||||
@ -1393,7 +1394,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
|
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float acos(const float &x) { return ::acosf(x); }
|
float acos(const float &x) { return ::acosf(x); }
|
||||||
|
|
||||||
@ -1408,12 +1409,14 @@ T asin(const T &x) {
|
|||||||
return asin(x);
|
return asin(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if EIGEN_HAS_CXX11_MATH
|
||||||
template<typename T>
|
template<typename T>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
T asinh(const T &x) {
|
T asinh(const T &x) {
|
||||||
EIGEN_USING_STD_MATH(asinh);
|
EIGEN_USING_STD_MATH(asinh);
|
||||||
return asinh(x);
|
return asinh(x);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__SYCL_DEVICE_ONLY__)
|
#if defined(__SYCL_DEVICE_ONLY__)
|
||||||
EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); }
|
EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); }
|
||||||
@ -1422,7 +1425,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
|
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float asin(const float &x) { return ::asinf(x); }
|
float asin(const float &x) { return ::asinf(x); }
|
||||||
|
|
||||||
@ -1437,12 +1440,14 @@ T atan(const T &x) {
|
|||||||
return atan(x);
|
return atan(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if EIGEN_HAS_CXX11_MATH
|
||||||
template<typename T>
|
template<typename T>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
T atanh(const T &x) {
|
T atanh(const T &x) {
|
||||||
EIGEN_USING_STD_MATH(atanh);
|
EIGEN_USING_STD_MATH(atanh);
|
||||||
return atanh(x);
|
return atanh(x);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__SYCL_DEVICE_ONLY__)
|
#if defined(__SYCL_DEVICE_ONLY__)
|
||||||
EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); }
|
EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); }
|
||||||
@ -1451,7 +1456,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
|
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float atan(const float &x) { return ::atanf(x); }
|
float atan(const float &x) { return ::atanf(x); }
|
||||||
|
|
||||||
@ -1472,7 +1477,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
|
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float cosh(const float &x) { return ::coshf(x); }
|
float cosh(const float &x) { return ::coshf(x); }
|
||||||
|
|
||||||
@ -1492,7 +1497,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
|
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float sinh(const float &x) { return ::sinhf(x); }
|
float sinh(const float &x) { return ::sinhf(x); }
|
||||||
|
|
||||||
@ -1510,12 +1515,12 @@ T tanh(const T &x) {
|
|||||||
#if defined(__SYCL_DEVICE_ONLY__)
|
#if defined(__SYCL_DEVICE_ONLY__)
|
||||||
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
|
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
|
||||||
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
|
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
|
||||||
#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
|
#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float tanh(const float &x) { return ::tanhf(x); }
|
float tanh(const float &x) { return ::tanhf(x); }
|
||||||
|
|
||||||
@ -1535,7 +1540,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
|
|||||||
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
|
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
template <>
|
template <>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float fmod(const float& a, const float& b) {
|
float fmod(const float& a, const float& b) {
|
||||||
|
@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
Derived& operator-=(const MatrixBase<OtherDerived>& other);
|
Derived& operator-=(const MatrixBase<OtherDerived>& other);
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
|
||||||
template<typename OtherDerived>
|
template<typename OtherDerived>
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
const Product<Derived,OtherDerived,LazyProduct>
|
|
||||||
operator*(const MatrixBase<OtherDerived> &other) const
|
|
||||||
{ return this->lazyProduct(other); }
|
|
||||||
#else
|
|
||||||
|
|
||||||
template<typename OtherDerived>
|
|
||||||
const Product<Derived,OtherDerived>
|
const Product<Derived,OtherDerived>
|
||||||
operator*(const MatrixBase<OtherDerived> &other) const;
|
operator*(const MatrixBase<OtherDerived> &other) const;
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template<typename OtherDerived>
|
template<typename OtherDerived>
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
const Product<Derived,OtherDerived,LazyProduct>
|
const Product<Derived,OtherDerived,LazyProduct>
|
||||||
@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
|
|||||||
Derived& setIdentity();
|
Derived& setIdentity();
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
Derived& setIdentity(Index rows, Index cols);
|
Derived& setIdentity(Index rows, Index cols);
|
||||||
|
EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
|
||||||
|
EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
|
||||||
|
|
||||||
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||||
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||||
@ -305,7 +298,7 @@ template<typename Derived> class MatrixBase
|
|||||||
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
|
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
|
||||||
{ return cwiseNotEqual(other).any(); }
|
{ return cwiseNotEqual(other).any(); }
|
||||||
|
|
||||||
NoAlias<Derived,Eigen::MatrixBase > noalias();
|
NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
|
||||||
|
|
||||||
// TODO forceAlignedAccess is temporarily disabled
|
// TODO forceAlignedAccess is temporarily disabled
|
||||||
// Need to find a nicer workaround.
|
// Need to find a nicer workaround.
|
||||||
@ -437,8 +430,10 @@ template<typename Derived> class MatrixBase
|
|||||||
///////// Jacobi module /////////
|
///////// Jacobi module /////////
|
||||||
|
|
||||||
template<typename OtherScalar>
|
template<typename OtherScalar>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
||||||
template<typename OtherScalar>
|
template<typename OtherScalar>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
||||||
|
|
||||||
///////// SparseCore module /////////
|
///////// SparseCore module /////////
|
||||||
|
@ -33,6 +33,7 @@ class NoAlias
|
|||||||
public:
|
public:
|
||||||
typedef typename ExpressionType::Scalar Scalar;
|
typedef typename ExpressionType::Scalar Scalar;
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
|
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
|
||||||
|
|
||||||
template<typename OtherDerived>
|
template<typename OtherDerived>
|
||||||
|
@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
|||||||
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
|
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
|
||||||
* \a data pointers.
|
* \a data pointers.
|
||||||
*
|
*
|
||||||
|
* Here is an example using strides:
|
||||||
|
* \include Matrix_Map_stride.cpp
|
||||||
|
* Output: \verbinclude Matrix_Map_stride.out
|
||||||
|
*
|
||||||
* \see class Map
|
* \see class Map
|
||||||
*/
|
*/
|
||||||
//@{
|
//@{
|
||||||
|
@ -851,7 +851,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
|||||||
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef __CUDACC__
|
#ifndef EIGEN_CUDACC
|
||||||
template<int LoadMode,typename PacketType>
|
template<int LoadMode,typename PacketType>
|
||||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||||
{
|
{
|
||||||
@ -895,7 +895,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
|||||||
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef __CUDACC__
|
#ifndef EIGEN_CUDACC
|
||||||
template<int LoadMode,typename PacketType>
|
template<int LoadMode,typename PacketType>
|
||||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||||
{
|
{
|
||||||
|
@ -16,7 +16,7 @@ namespace Eigen {
|
|||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||||
|
|
||||||
// Many std::complex methods such as operator+, operator-, operator* and
|
// Many std::complex methods such as operator+, operator-, operator* and
|
||||||
// operator/ are not constexpr. Due to this, clang does not treat them as device
|
// operator/ are not constexpr. Due to this, clang does not treat them as device
|
||||||
|
@ -140,7 +140,7 @@ struct half : public half_impl::half_base {
|
|||||||
|
|
||||||
namespace half_impl {
|
namespace half_impl {
|
||||||
|
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||||
|
|
||||||
// Intrinsics for native fp16 support. Note that on current hardware,
|
// Intrinsics for native fp16 support. Note that on current hardware,
|
||||||
// these are no faster than fp32 arithmetic (you need to use the half2
|
// these are no faster than fp32 arithmetic (you need to use the half2
|
||||||
@ -281,7 +281,7 @@ union FP32 {
|
|||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
return __float2half(ff);
|
return __float2half(ff);
|
||||||
|
|
||||||
#elif defined(EIGEN_HAS_FP16_C)
|
#elif defined(EIGEN_HAS_FP16_C)
|
||||||
@ -336,7 +336,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
return __half2float(h);
|
return __half2float(h);
|
||||||
|
|
||||||
#elif defined(EIGEN_HAS_FP16_C)
|
#elif defined(EIGEN_HAS_FP16_C)
|
||||||
@ -370,7 +370,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
|
|||||||
return (a.x & 0x7fff) == 0x7c00;
|
return (a.x & 0x7fff) == 0x7c00;
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||||
return __hisnan(a);
|
return __hisnan(a);
|
||||||
#else
|
#else
|
||||||
return (a.x & 0x7fff) > 0x7c00;
|
return (a.x & 0x7fff) > 0x7c00;
|
||||||
@ -386,7 +386,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||||
return half(hexp(a));
|
return half(hexp(a));
|
||||||
#else
|
#else
|
||||||
return half(::expf(float(a)));
|
return half(::expf(float(a)));
|
||||||
@ -396,7 +396,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
|
|||||||
return half(numext::expm1(float(a)));
|
return half(numext::expm1(float(a)));
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||||
return half(::hlog(a));
|
return half(::hlog(a));
|
||||||
#else
|
#else
|
||||||
return half(::logf(float(a)));
|
return half(::logf(float(a)));
|
||||||
@ -409,7 +409,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
|
|||||||
return half(::log10f(float(a)));
|
return half(::log10f(float(a)));
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||||
return half(hsqrt(a));
|
return half(hsqrt(a));
|
||||||
#else
|
#else
|
||||||
return half(::sqrtf(float(a)));
|
return half(::sqrtf(float(a)));
|
||||||
@ -431,14 +431,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
|
|||||||
return half(::tanhf(float(a)));
|
return half(::tanhf(float(a)));
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||||
return half(hfloor(a));
|
return half(hfloor(a));
|
||||||
#else
|
#else
|
||||||
return half(::floorf(float(a)));
|
return half(::floorf(float(a)));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||||
return half(hceil(a));
|
return half(hceil(a));
|
||||||
#else
|
#else
|
||||||
return half(::ceilf(float(a)));
|
return half(::ceilf(float(a)));
|
||||||
@ -446,7 +446,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||||
return __hlt(b, a) ? b : a;
|
return __hlt(b, a) ? b : a;
|
||||||
#else
|
#else
|
||||||
const float f1 = static_cast<float>(a);
|
const float f1 = static_cast<float>(a);
|
||||||
@ -455,7 +455,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||||
return __hlt(a, b) ? b : a;
|
return __hlt(a, b) ? b : a;
|
||||||
#else
|
#else
|
||||||
const float f1 = static_cast<float>(a);
|
const float f1 = static_cast<float>(a);
|
||||||
@ -576,7 +576,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
|
|||||||
return Eigen::half(::expf(float(a)));
|
return Eigen::half(::expf(float(a)));
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||||
return Eigen::half(::hlog(a));
|
return Eigen::half(::hlog(a));
|
||||||
#else
|
#else
|
||||||
return Eigen::half(::logf(float(a)));
|
return Eigen::half(::logf(float(a)));
|
||||||
@ -610,14 +610,14 @@ struct hash<Eigen::half> {
|
|||||||
|
|
||||||
|
|
||||||
// Add the missing shfl_xor intrinsic
|
// Add the missing shfl_xor intrinsic
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// ldg() has an overload for __half, but we also need one for Eigen::half.
|
// ldg() has an overload for __half, but we also need one for Eigen::half.
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||||
return Eigen::half_impl::raw_uint16_to_half(
|
return Eigen::half_impl::raw_uint16_to_half(
|
||||||
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
||||||
@ -625,7 +625,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__)
|
#if defined(EIGEN_CUDA_ARCH)
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
namespace numext {
|
namespace numext {
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ namespace internal {
|
|||||||
// Make sure this is only available when targeting a GPU: we don't want to
|
// Make sure this is only available when targeting a GPU: we don't want to
|
||||||
// introduce conflicts between these packet_traits definitions and the ones
|
// introduce conflicts between these packet_traits definitions and the ones
|
||||||
// we'll use on the host side (SSE, AVX, ...)
|
// we'll use on the host side (SSE, AVX, ...)
|
||||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
float4 plog<float4>(const float4& a)
|
float4 plog<float4>(const float4& a)
|
||||||
{
|
{
|
||||||
|
@ -17,7 +17,7 @@ namespace internal {
|
|||||||
// Make sure this is only available when targeting a GPU: we don't want to
|
// Make sure this is only available when targeting a GPU: we don't want to
|
||||||
// introduce conflicts between these packet_traits definitions and the ones
|
// introduce conflicts between these packet_traits definitions and the ones
|
||||||
// we'll use on the host side (SSE, AVX, ...)
|
// we'll use on the host side (SSE, AVX, ...)
|
||||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||||
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
||||||
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
||||||
|
|
||||||
@ -196,7 +196,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
|
|||||||
|
|
||||||
template<>
|
template<>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||||
return __ldg((const float4*)from);
|
return __ldg((const float4*)from);
|
||||||
#else
|
#else
|
||||||
return make_float4(from[0], from[1], from[2], from[3]);
|
return make_float4(from[0], from[1], from[2], from[3]);
|
||||||
@ -204,7 +204,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
|
|||||||
}
|
}
|
||||||
template<>
|
template<>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||||
return __ldg((const double2*)from);
|
return __ldg((const double2*)from);
|
||||||
#else
|
#else
|
||||||
return make_double2(from[0], from[1]);
|
return make_double2(from[0], from[1]);
|
||||||
@ -213,7 +213,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
|
|||||||
|
|
||||||
template<>
|
template<>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||||
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
|
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
|
||||||
#else
|
#else
|
||||||
return make_float4(from[0], from[1], from[2], from[3]);
|
return make_float4(from[0], from[1], from[2], from[3]);
|
||||||
@ -221,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
|
|||||||
}
|
}
|
||||||
template<>
|
template<>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||||
return make_double2(__ldg(from+0), __ldg(from+1));
|
return make_double2(__ldg(from+0), __ldg(from+1));
|
||||||
#else
|
#else
|
||||||
return make_double2(from[0], from[1]);
|
return make_double2(from[0], from[1]);
|
||||||
|
@ -15,7 +15,7 @@ namespace Eigen {
|
|||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
// Most of the following operations require arch >= 3.0
|
// Most of the following operations require arch >= 3.0
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
|
|
||||||
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
||||||
|
|
||||||
@ -69,7 +69,7 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half*
|
|||||||
|
|
||||||
template<>
|
template<>
|
||||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
||||||
#if __CUDA_ARCH__ >= 350
|
#if EIGEN_CUDA_ARCH >= 350
|
||||||
return __ldg((const half2*)from);
|
return __ldg((const half2*)from);
|
||||||
#else
|
#else
|
||||||
return __halves2half2(*(from+0), *(from+1));
|
return __halves2half2(*(from+0), *(from+1));
|
||||||
@ -78,7 +78,7 @@ template<>
|
|||||||
|
|
||||||
template<>
|
template<>
|
||||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
||||||
#if __CUDA_ARCH__ >= 350
|
#if EIGEN_CUDA_ARCH >= 350
|
||||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||||
#else
|
#else
|
||||||
return __halves2half2(*(from+0), *(from+1));
|
return __halves2half2(*(from+0), *(from+1));
|
||||||
@ -116,7 +116,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||||
#else
|
#else
|
||||||
float f = __half2float(a) + 1.0f;
|
float f = __half2float(a) + 1.0f;
|
||||||
@ -125,7 +125,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hadd2(a, b);
|
return __hadd2(a, b);
|
||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
@ -139,7 +139,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hsub2(a, b);
|
return __hsub2(a, b);
|
||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
@ -153,7 +153,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hneg2(a);
|
return __hneg2(a);
|
||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
@ -165,7 +165,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
|||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hmul2(a, b);
|
return __hmul2(a, b);
|
||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
@ -179,7 +179,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hfma2(a, b, c);
|
return __hfma2(a, b, c);
|
||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
@ -225,7 +225,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hadd(__low2half(a), __high2half(a));
|
return __hadd(__low2half(a), __high2half(a));
|
||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
@ -235,7 +235,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
__half first = __low2half(a);
|
__half first = __low2half(a);
|
||||||
__half second = __high2half(a);
|
__half second = __high2half(a);
|
||||||
return __hgt(first, second) ? first : second;
|
return __hgt(first, second) ? first : second;
|
||||||
@ -247,7 +247,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
__half first = __low2half(a);
|
__half first = __low2half(a);
|
||||||
__half second = __high2half(a);
|
__half second = __high2half(a);
|
||||||
return __hlt(first, second) ? first : second;
|
return __hlt(first, second) ? first : second;
|
||||||
@ -259,7 +259,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
||||||
#if __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hmul(__low2half(a), __high2half(a));
|
return __hmul(__low2half(a), __high2half(a));
|
||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
@ -284,7 +284,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
|
|||||||
return __floats2half2_rn(r1, r2);
|
return __floats2half2_rn(r1, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE
|
template<> __device__ EIGEN_STRONG_INLINE
|
||||||
half2 plog<half2>(const half2& a) {
|
half2 plog<half2>(const half2& a) {
|
||||||
|
@ -19,7 +19,7 @@ struct scalar_cast_op<float, Eigen::half> {
|
|||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||||
typedef Eigen::half result_type;
|
typedef Eigen::half result_type;
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
return __float2half(a);
|
return __float2half(a);
|
||||||
#else
|
#else
|
||||||
return Eigen::half(a);
|
return Eigen::half(a);
|
||||||
@ -37,7 +37,7 @@ struct scalar_cast_op<int, Eigen::half> {
|
|||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||||
typedef Eigen::half result_type;
|
typedef Eigen::half result_type;
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
return __float2half(static_cast<float>(a));
|
return __float2half(static_cast<float>(a));
|
||||||
#else
|
#else
|
||||||
return Eigen::half(static_cast<float>(a));
|
return Eigen::half(static_cast<float>(a));
|
||||||
@ -55,7 +55,7 @@ struct scalar_cast_op<Eigen::half, float> {
|
|||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||||
typedef float result_type;
|
typedef float result_type;
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
return __half2float(a);
|
return __half2float(a);
|
||||||
#else
|
#else
|
||||||
return static_cast<float>(a);
|
return static_cast<float>(a);
|
||||||
@ -69,7 +69,7 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct type_casting_traits<Eigen::half, float> {
|
struct type_casting_traits<Eigen::half, float> {
|
||||||
|
@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
|
|||||||
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
|
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
|
||||||
{
|
{
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
// FIXME is there some kind of cuda::swap?
|
// FIXME is there some kind of cuda::swap?
|
||||||
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
|
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
|
||||||
#else
|
#else
|
||||||
|
@ -427,7 +427,13 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
|||||||
template<typename Dst>
|
template<typename Dst>
|
||||||
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||||
{
|
{
|
||||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
|
||||||
|
// to determine the following heuristic.
|
||||||
|
// EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
|
||||||
|
// unless it has been specialized by the user or for a given architecture.
|
||||||
|
// Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
|
||||||
|
// I'm not sure it is still required.
|
||||||
|
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
|
||||||
lazyproduct::evalTo(dst, lhs, rhs);
|
lazyproduct::evalTo(dst, lhs, rhs);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -439,7 +445,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
|||||||
template<typename Dst>
|
template<typename Dst>
|
||||||
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||||
{
|
{
|
||||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
|
||||||
lazyproduct::addTo(dst, lhs, rhs);
|
lazyproduct::addTo(dst, lhs, rhs);
|
||||||
else
|
else
|
||||||
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
|
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
|
||||||
@ -448,7 +454,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
|||||||
template<typename Dst>
|
template<typename Dst>
|
||||||
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||||
{
|
{
|
||||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
|
||||||
lazyproduct::subTo(dst, lhs, rhs);
|
lazyproduct::subTo(dst, lhs, rhs);
|
||||||
else
|
else
|
||||||
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
|
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
|
||||||
|
@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
|
|||||||
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
|
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
|
||||||
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
|
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
|
||||||
EIGTYPE beta(1); \
|
EIGTYPE beta(1); \
|
||||||
BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
|
BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
|
|||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_MKL
|
||||||
|
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
|
||||||
|
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
|
||||||
|
#else
|
||||||
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
|
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
|
||||||
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
|
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
|
||||||
|
#endif
|
||||||
|
|
||||||
// TODO hanlde complex cases
|
// TODO hanlde complex cases
|
||||||
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
|
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
|
||||||
|
@ -46,7 +46,7 @@ namespace internal {
|
|||||||
|
|
||||||
// gemm specialization
|
// gemm specialization
|
||||||
|
|
||||||
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
|
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
|
||||||
template< \
|
template< \
|
||||||
typename Index, \
|
typename Index, \
|
||||||
int LhsStorageOrder, bool ConjugateLhs, \
|
int LhsStorageOrder, bool ConjugateLhs, \
|
||||||
@ -100,13 +100,20 @@ static void run(Index rows, Index cols, Index depth, \
|
|||||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||||
} else b = _rhs; \
|
} else b = _rhs; \
|
||||||
\
|
\
|
||||||
BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||||
}};
|
}};
|
||||||
|
|
||||||
GEMM_SPECIALIZATION(double, d, double, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
GEMM_SPECIALIZATION(float, f, float, s)
|
GEMM_SPECIALIZATION(double, d, double, dgemm)
|
||||||
GEMM_SPECIALIZATION(dcomplex, cd, double, z)
|
GEMM_SPECIALIZATION(float, f, float, sgemm)
|
||||||
GEMM_SPECIALIZATION(scomplex, cf, float, c)
|
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
|
||||||
|
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
|
||||||
|
#else
|
||||||
|
GEMM_SPECIALIZATION(double, d, double, dgemm_)
|
||||||
|
GEMM_SPECIALIZATION(float, f, float, sgemm_)
|
||||||
|
GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
|
||||||
|
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
|
||||||
|
#endif
|
||||||
|
|
||||||
} // end namespase internal
|
} // end namespase internal
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
|
|||||||
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
|
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
|
||||||
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
|
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
|
||||||
|
|
||||||
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
|
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
|
||||||
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
|
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
|
||||||
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
|
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
|
||||||
{ \
|
{ \
|
||||||
@ -113,14 +113,21 @@ static void run( \
|
|||||||
x_ptr=x_tmp.data(); \
|
x_ptr=x_tmp.data(); \
|
||||||
incx=1; \
|
incx=1; \
|
||||||
} else x_ptr=rhs; \
|
} else x_ptr=rhs; \
|
||||||
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
|
BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
|
||||||
}\
|
}\
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
|
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
|
||||||
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
|
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
|
||||||
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
|
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
|
||||||
|
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
|
||||||
|
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
|
||||||
|
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
|
||||||
|
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
|
||||||
|
#endif
|
||||||
|
|
||||||
} // end namespase internal
|
} // end namespase internal
|
||||||
|
|
||||||
|
@ -40,7 +40,7 @@ namespace internal {
|
|||||||
|
|
||||||
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
|
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
|
||||||
|
|
||||||
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||||
template <typename Index, \
|
template <typename Index, \
|
||||||
int LhsStorageOrder, bool ConjugateLhs, \
|
int LhsStorageOrder, bool ConjugateLhs, \
|
||||||
int RhsStorageOrder, bool ConjugateRhs> \
|
int RhsStorageOrder, bool ConjugateRhs> \
|
||||||
@ -81,13 +81,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
|
|||||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||||
} else b = _rhs; \
|
} else b = _rhs; \
|
||||||
\
|
\
|
||||||
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||||
\
|
\
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||||
template <typename Index, \
|
template <typename Index, \
|
||||||
int LhsStorageOrder, bool ConjugateLhs, \
|
int LhsStorageOrder, bool ConjugateLhs, \
|
||||||
int RhsStorageOrder, bool ConjugateRhs> \
|
int RhsStorageOrder, bool ConjugateRhs> \
|
||||||
@ -144,20 +144,26 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
|
|||||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||||
\
|
\
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_SYMM_L(double, double, d, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_SYMM_L(float, float, f, s)
|
EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
|
||||||
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
|
EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
|
||||||
EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
|
EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
|
||||||
|
EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
|
||||||
|
EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
|
||||||
|
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
|
||||||
|
EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
|
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
|
||||||
|
|
||||||
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||||
template <typename Index, \
|
template <typename Index, \
|
||||||
int LhsStorageOrder, bool ConjugateLhs, \
|
int LhsStorageOrder, bool ConjugateLhs, \
|
||||||
int RhsStorageOrder, bool ConjugateRhs> \
|
int RhsStorageOrder, bool ConjugateRhs> \
|
||||||
@ -197,13 +203,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
|
|||||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||||
} else b = _lhs; \
|
} else b = _lhs; \
|
||||||
\
|
\
|
||||||
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||||
\
|
\
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||||
template <typename Index, \
|
template <typename Index, \
|
||||||
int LhsStorageOrder, bool ConjugateLhs, \
|
int LhsStorageOrder, bool ConjugateLhs, \
|
||||||
int RhsStorageOrder, bool ConjugateRhs> \
|
int RhsStorageOrder, bool ConjugateRhs> \
|
||||||
@ -259,15 +265,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
|
|||||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_SYMM_R(double, double, d, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_SYMM_R(float, float, f, s)
|
EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
|
||||||
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
|
EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
|
||||||
EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
|
EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
|
||||||
|
EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
|
||||||
|
EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
|
||||||
|
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
|
||||||
|
EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
|
||||||
|
#endif
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
|
|||||||
x_tmp=map_x.conjugate(); \
|
x_tmp=map_x.conjugate(); \
|
||||||
x_ptr=x_tmp.data(); \
|
x_ptr=x_tmp.data(); \
|
||||||
} else x_ptr=_rhs; \
|
} else x_ptr=_rhs; \
|
||||||
BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
|
BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
|
||||||
}\
|
}\
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_MKL
|
||||||
|
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
|
||||||
|
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
|
||||||
|
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
|
||||||
|
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
|
||||||
|
#else
|
||||||
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
|
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
|
||||||
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
|
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
|
||||||
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
|
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
|
||||||
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
|
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
|
||||||
|
#endif
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
|
@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
|||||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||||
|
|
||||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
|
// To work around an "error: member reference base type 'Matrix<...>
|
||||||
|
// (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
|
||||||
|
// not a structure or union" compilation error in nvcc (tested V8.0.61),
|
||||||
|
// create a dummy internal::constructor_without_unaligned_array_assert
|
||||||
|
// object to pass to the Matrix constructor.
|
||||||
|
internal::constructor_without_unaligned_array_assert a;
|
||||||
|
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
|
||||||
triangularBuffer.setZero();
|
triangularBuffer.setZero();
|
||||||
if((Mode&ZeroDiag)==ZeroDiag)
|
if((Mode&ZeroDiag)==ZeroDiag)
|
||||||
triangularBuffer.diagonal().setZero();
|
triangularBuffer.diagonal().setZero();
|
||||||
@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
|||||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||||
|
|
||||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
|
internal::constructor_without_unaligned_array_assert a;
|
||||||
|
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
|
||||||
triangularBuffer.setZero();
|
triangularBuffer.setZero();
|
||||||
if((Mode&ZeroDiag)==ZeroDiag)
|
if((Mode&ZeroDiag)==ZeroDiag)
|
||||||
triangularBuffer.diagonal().setZero();
|
triangularBuffer.diagonal().setZero();
|
||||||
|
@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
|
|||||||
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
|
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
|
||||||
|
|
||||||
// implements col-major += alpha * op(triangular) * op(general)
|
// implements col-major += alpha * op(triangular) * op(general)
|
||||||
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||||
template <typename Index, int Mode, \
|
template <typename Index, int Mode, \
|
||||||
int LhsStorageOrder, bool ConjugateLhs, \
|
int LhsStorageOrder, bool ConjugateLhs, \
|
||||||
int RhsStorageOrder, bool ConjugateRhs> \
|
int RhsStorageOrder, bool ConjugateRhs> \
|
||||||
@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
|
|||||||
} \
|
} \
|
||||||
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
|
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
|
||||||
/* call ?trmm*/ \
|
/* call ?trmm*/ \
|
||||||
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
|
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
|
||||||
\
|
\
|
||||||
/* Add op(a_triangular)*b into res*/ \
|
/* Add op(a_triangular)*b into res*/ \
|
||||||
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
|
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
|
||||||
@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
|
|||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_TRMM_L(double, double, d, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
|
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
|
||||||
EIGEN_BLAS_TRMM_L(float, float, f, s)
|
EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
|
||||||
EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
|
EIGEN_BLAS_TRMM_L(float, float, f, strmm)
|
||||||
|
EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
|
||||||
|
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
|
||||||
|
EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
|
||||||
|
EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
|
||||||
|
#endif
|
||||||
|
|
||||||
// implements col-major += alpha * op(general) * op(triangular)
|
// implements col-major += alpha * op(general) * op(triangular)
|
||||||
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||||
template <typename Index, int Mode, \
|
template <typename Index, int Mode, \
|
||||||
int LhsStorageOrder, bool ConjugateLhs, \
|
int LhsStorageOrder, bool ConjugateLhs, \
|
||||||
int RhsStorageOrder, bool ConjugateRhs> \
|
int RhsStorageOrder, bool ConjugateRhs> \
|
||||||
@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
|
|||||||
} \
|
} \
|
||||||
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
|
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
|
||||||
/* call ?trmm*/ \
|
/* call ?trmm*/ \
|
||||||
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
|
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
|
||||||
\
|
\
|
||||||
/* Add op(a_triangular)*b into res*/ \
|
/* Add op(a_triangular)*b into res*/ \
|
||||||
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
|
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
|
||||||
@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
|
|||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_TRMM_R(double, double, d, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
|
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
|
||||||
EIGEN_BLAS_TRMM_R(float, float, f, s)
|
EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
|
||||||
EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)
|
EIGEN_BLAS_TRMM_R(float, float, f, strmm)
|
||||||
|
EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
|
||||||
|
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
|
||||||
|
EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
|
||||||
|
EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
|
||||||
|
#endif
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
|
|||||||
EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
|
EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
|
||||||
|
|
||||||
// implements col-major: res += alpha * op(triangular) * vector
|
// implements col-major: res += alpha * op(triangular) * vector
|
||||||
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
|
||||||
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
|
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
|
||||||
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
|
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
|
||||||
enum { \
|
enum { \
|
||||||
@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
|
|||||||
diag = IsUnitDiag ? 'U' : 'N'; \
|
diag = IsUnitDiag ? 'U' : 'N'; \
|
||||||
\
|
\
|
||||||
/* call ?TRMV*/ \
|
/* call ?TRMV*/ \
|
||||||
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
|
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
|
||||||
\
|
\
|
||||||
/* Add op(a_tr)rhs into res*/ \
|
/* Add op(a_tr)rhs into res*/ \
|
||||||
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
|
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
|
||||||
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
|
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
|
||||||
if (size<(std::max)(rows,cols)) { \
|
if (size<(std::max)(rows,cols)) { \
|
||||||
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
|
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
|
||||||
@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
|
|||||||
m = convert_index<BlasIndex>(size); \
|
m = convert_index<BlasIndex>(size); \
|
||||||
n = convert_index<BlasIndex>(cols-size); \
|
n = convert_index<BlasIndex>(cols-size); \
|
||||||
} \
|
} \
|
||||||
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
|
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_TRMV_CM(double, double, d, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
|
EIGEN_BLAS_TRMV_CM(double, double, d, d,)
|
||||||
EIGEN_BLAS_TRMV_CM(float, float, f, s)
|
EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
|
||||||
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
|
EIGEN_BLAS_TRMV_CM(float, float, f, s,)
|
||||||
|
EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
|
||||||
|
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
|
||||||
|
EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
|
||||||
|
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
|
||||||
|
#endif
|
||||||
|
|
||||||
// implements row-major: res += alpha * op(triangular) * vector
|
// implements row-major: res += alpha * op(triangular) * vector
|
||||||
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
|
||||||
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
|
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
|
||||||
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
|
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
|
||||||
enum { \
|
enum { \
|
||||||
@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
|
|||||||
diag = IsUnitDiag ? 'U' : 'N'; \
|
diag = IsUnitDiag ? 'U' : 'N'; \
|
||||||
\
|
\
|
||||||
/* call ?TRMV*/ \
|
/* call ?TRMV*/ \
|
||||||
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
|
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
|
||||||
\
|
\
|
||||||
/* Add op(a_tr)rhs into res*/ \
|
/* Add op(a_tr)rhs into res*/ \
|
||||||
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
|
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
|
||||||
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
|
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
|
||||||
if (size<(std::max)(rows,cols)) { \
|
if (size<(std::max)(rows,cols)) { \
|
||||||
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
|
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
|
||||||
@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
|
|||||||
m = convert_index<BlasIndex>(size); \
|
m = convert_index<BlasIndex>(size); \
|
||||||
n = convert_index<BlasIndex>(cols-size); \
|
n = convert_index<BlasIndex>(cols-size); \
|
||||||
} \
|
} \
|
||||||
BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
|
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_TRMV_RM(double, double, d, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
|
EIGEN_BLAS_TRMV_RM(double, double, d, d,)
|
||||||
EIGEN_BLAS_TRMV_RM(float, float, f, s)
|
EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
|
||||||
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
|
EIGEN_BLAS_TRMV_RM(float, float, f, s,)
|
||||||
|
EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
|
||||||
|
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
|
||||||
|
EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
|
||||||
|
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
|
||||||
|
#endif
|
||||||
|
|
||||||
} // end namespase internal
|
} // end namespase internal
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ namespace Eigen {
|
|||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
// implements LeftSide op(triangular)^-1 * general
|
// implements LeftSide op(triangular)^-1 * general
|
||||||
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
|
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
|
||||||
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
|
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
|
||||||
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
|
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
|
||||||
{ \
|
{ \
|
||||||
@ -80,18 +80,24 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
|
|||||||
} \
|
} \
|
||||||
if (IsUnitDiag) diag='U'; \
|
if (IsUnitDiag) diag='U'; \
|
||||||
/* call ?trsm*/ \
|
/* call ?trsm*/ \
|
||||||
BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
|
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_TRSM_L(double, double, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_TRSM_L(dcomplex, double, z)
|
EIGEN_BLAS_TRSM_L(double, double, dtrsm)
|
||||||
EIGEN_BLAS_TRSM_L(float, float, s)
|
EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
|
||||||
EIGEN_BLAS_TRSM_L(scomplex, float, c)
|
EIGEN_BLAS_TRSM_L(float, float, strsm)
|
||||||
|
EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
|
||||||
|
EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
|
||||||
|
EIGEN_BLAS_TRSM_L(float, float, strsm_)
|
||||||
|
EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
|
||||||
|
#endif
|
||||||
|
|
||||||
// implements RightSide general * op(triangular)^-1
|
// implements RightSide general * op(triangular)^-1
|
||||||
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
|
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
|
||||||
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
|
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
|
||||||
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
|
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
|
||||||
{ \
|
{ \
|
||||||
@ -133,16 +139,22 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
|
|||||||
} \
|
} \
|
||||||
if (IsUnitDiag) diag='U'; \
|
if (IsUnitDiag) diag='U'; \
|
||||||
/* call ?trsm*/ \
|
/* call ?trsm*/ \
|
||||||
BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
|
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
|
||||||
/*std::cout << "TRMS_L specialization!\n";*/ \
|
/*std::cout << "TRMS_L specialization!\n";*/ \
|
||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_BLAS_TRSM_R(double, double, d)
|
#ifdef EIGEN_USE_MKL
|
||||||
EIGEN_BLAS_TRSM_R(dcomplex, double, z)
|
EIGEN_BLAS_TRSM_R(double, double, dtrsm)
|
||||||
EIGEN_BLAS_TRSM_R(float, float, s)
|
EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
|
||||||
EIGEN_BLAS_TRSM_R(scomplex, float, c)
|
EIGEN_BLAS_TRSM_R(float, float, strsm)
|
||||||
|
EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
|
||||||
|
#else
|
||||||
|
EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
|
||||||
|
EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
|
||||||
|
EIGEN_BLAS_TRSM_R(float, float, strsm_)
|
||||||
|
EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
|
||||||
|
#endif
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
|
@ -55,6 +55,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined __NVCC__
|
#if defined __NVCC__
|
||||||
|
#pragma diag_suppress boolean_controlling_expr_is_constant
|
||||||
// Disable the "statement is unreachable" message
|
// Disable the "statement is unreachable" message
|
||||||
#pragma diag_suppress code_is_unreachable
|
#pragma diag_suppress code_is_unreachable
|
||||||
// Disable the "dynamic initialization in unreachable code" message
|
// Disable the "dynamic initialization in unreachable code" message
|
||||||
|
@ -49,10 +49,11 @@
|
|||||||
#define EIGEN_USE_LAPACKE
|
#define EIGEN_USE_LAPACKE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(EIGEN_USE_MKL_VML)
|
#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
|
||||||
#define EIGEN_USE_MKL
|
#define EIGEN_USE_MKL
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined EIGEN_USE_MKL
|
#if defined EIGEN_USE_MKL
|
||||||
# include <mkl.h>
|
# include <mkl.h>
|
||||||
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
|
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
|
||||||
@ -108,6 +109,10 @@
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
|
||||||
|
#include "../../misc/blas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
|
||||||
typedef std::complex<double> dcomplex;
|
typedef std::complex<double> dcomplex;
|
||||||
@ -121,8 +126,5 @@ typedef int BlasIndex;
|
|||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
#if defined(EIGEN_USE_BLAS)
|
|
||||||
#include "../../misc/blas.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // EIGEN_MKL_SUPPORT_H
|
#endif // EIGEN_MKL_SUPPORT_H
|
||||||
|
@ -413,7 +413,7 @@
|
|||||||
// Does the compiler support variadic templates?
|
// Does the compiler support variadic templates?
|
||||||
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
|
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
|
||||||
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
|
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
|
||||||
&& (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
|
&& (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
|
||||||
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
|
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
|
||||||
// this prevents nvcc from crashing when compiling Eigen on Tegra X1
|
// this prevents nvcc from crashing when compiling Eigen on Tegra X1
|
||||||
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
|
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
|
||||||
@ -427,9 +427,9 @@
|
|||||||
// Does the compiler fully support const expressions? (as in c++14)
|
// Does the compiler fully support const expressions? (as in c++14)
|
||||||
#ifndef EIGEN_HAS_CONSTEXPR
|
#ifndef EIGEN_HAS_CONSTEXPR
|
||||||
|
|
||||||
#if defined(__CUDACC__)
|
#if defined(EIGEN_CUDACC)
|
||||||
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
|
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
|
||||||
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
|
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
|
||||||
#define EIGEN_HAS_CONSTEXPR 1
|
#define EIGEN_HAS_CONSTEXPR 1
|
||||||
#endif
|
#endif
|
||||||
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
|
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
|
||||||
@ -669,7 +669,7 @@ namespace Eigen {
|
|||||||
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
|
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
|
||||||
* vectorized and non-vectorized code.
|
* vectorized and non-vectorized code.
|
||||||
*/
|
*/
|
||||||
#if (defined __CUDACC__)
|
#if (defined EIGEN_CUDACC)
|
||||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
|
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
|
||||||
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
|
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
|
||||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
|
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
|
||||||
@ -837,7 +837,8 @@ namespace Eigen {
|
|||||||
// just an empty macro !
|
// just an empty macro !
|
||||||
#define EIGEN_EMPTY
|
#define EIGEN_EMPTY
|
||||||
|
|
||||||
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
|
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
|
||||||
|
// for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
|
||||||
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
|
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
|
||||||
using Base::operator =;
|
using Base::operator =;
|
||||||
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
|
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
|
||||||
@ -990,7 +991,7 @@ namespace Eigen {
|
|||||||
# define EIGEN_TRY try
|
# define EIGEN_TRY try
|
||||||
# define EIGEN_CATCH(X) catch (X)
|
# define EIGEN_CATCH(X) catch (X)
|
||||||
#else
|
#else
|
||||||
# ifdef __CUDA_ARCH__
|
# ifdef EIGEN_CUDA_ARCH
|
||||||
# define EIGEN_THROW_X(X) asm("trap;")
|
# define EIGEN_THROW_X(X) asm("trap;")
|
||||||
# define EIGEN_THROW asm("trap;")
|
# define EIGEN_THROW asm("trap;")
|
||||||
# else
|
# else
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
#ifndef EIGEN_META_H
|
#ifndef EIGEN_META_H
|
||||||
#define EIGEN_META_H
|
#define EIGEN_META_H
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__)
|
#if defined(EIGEN_CUDA_ARCH)
|
||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
#include <math_constants.h>
|
#include <math_constants.h>
|
||||||
#endif
|
#endif
|
||||||
@ -169,7 +169,7 @@ template<bool Condition, typename T=void> struct enable_if;
|
|||||||
template<typename T> struct enable_if<true,T>
|
template<typename T> struct enable_if<true,T>
|
||||||
{ typedef T type; };
|
{ typedef T type; };
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__)
|
#if defined(EIGEN_CUDA_ARCH)
|
||||||
#if !defined(__FLT_EPSILON__)
|
#if !defined(__FLT_EPSILON__)
|
||||||
#define __FLT_EPSILON__ FLT_EPSILON
|
#define __FLT_EPSILON__ FLT_EPSILON
|
||||||
#define __DBL_EPSILON__ DBL_EPSILON
|
#define __DBL_EPSILON__ DBL_EPSILON
|
||||||
@ -523,13 +523,13 @@ template<typename T, typename U> struct scalar_product_traits
|
|||||||
|
|
||||||
namespace numext {
|
namespace numext {
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__)
|
#if defined(EIGEN_CUDA_ARCH)
|
||||||
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
|
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
|
||||||
#else
|
#else
|
||||||
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
|
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__)
|
#if defined(EIGEN_CUDA_ARCH)
|
||||||
using internal::device::numeric_limits;
|
using internal::device::numeric_limits;
|
||||||
#else
|
#else
|
||||||
using std::numeric_limits;
|
using std::numeric_limits;
|
||||||
|
@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const Quaterni
|
|||||||
if (n != Scalar(0))
|
if (n != Scalar(0))
|
||||||
{
|
{
|
||||||
m_angle = Scalar(2)*atan2(n, abs(q.w()));
|
m_angle = Scalar(2)*atan2(n, abs(q.w()));
|
||||||
if(q.w() < 0)
|
if(q.w() < Scalar(0))
|
||||||
n = -n;
|
n = -n;
|
||||||
m_axis = q.vec() / n;
|
m_axis = q.vec() / n;
|
||||||
}
|
}
|
||||||
|
@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase<Derived, 3>
|
|||||||
typedef typename internal::traits<Derived>::Scalar Scalar;
|
typedef typename internal::traits<Derived>::Scalar Scalar;
|
||||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||||
typedef typename internal::traits<Derived>::Coefficients Coefficients;
|
typedef typename internal::traits<Derived>::Coefficients Coefficients;
|
||||||
|
typedef typename Coefficients::CoeffReturnType CoeffReturnType;
|
||||||
|
typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
|
||||||
|
Scalar&, CoeffReturnType>::type NonConstCoeffReturnType;
|
||||||
|
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
Flags = Eigen::internal::traits<Derived>::Flags
|
Flags = Eigen::internal::traits<Derived>::Flags
|
||||||
};
|
};
|
||||||
@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase<Derived, 3>
|
|||||||
|
|
||||||
|
|
||||||
/** \returns the \c x coefficient */
|
/** \returns the \c x coefficient */
|
||||||
EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); }
|
EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
|
||||||
/** \returns the \c y coefficient */
|
/** \returns the \c y coefficient */
|
||||||
EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); }
|
EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
|
||||||
/** \returns the \c z coefficient */
|
/** \returns the \c z coefficient */
|
||||||
EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); }
|
EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
|
||||||
/** \returns the \c w coefficient */
|
/** \returns the \c w coefficient */
|
||||||
EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); }
|
EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
|
||||||
|
|
||||||
/** \returns a reference to the \c x coefficient */
|
/** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
|
||||||
EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
|
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
|
||||||
/** \returns a reference to the \c y coefficient */
|
/** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
|
||||||
EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
|
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
|
||||||
/** \returns a reference to the \c z coefficient */
|
/** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
|
||||||
EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
|
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
|
||||||
/** \returns a reference to the \c w coefficient */
|
/** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
|
||||||
EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
|
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
|
||||||
|
|
||||||
/** \returns a read-only vector expression of the imaginary part (x,y,z) */
|
/** \returns a read-only vector expression of the imaginary part (x,y,z) */
|
||||||
EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
|
EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
|
||||||
|
@ -37,17 +37,20 @@ template<typename Scalar> class JacobiRotation
|
|||||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||||
|
|
||||||
/** Default constructor without any initialization. */
|
/** Default constructor without any initialization. */
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
JacobiRotation() {}
|
JacobiRotation() {}
|
||||||
|
|
||||||
/** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
|
/** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
|
JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
|
||||||
|
|
||||||
Scalar& c() { return m_c; }
|
EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
|
||||||
Scalar c() const { return m_c; }
|
EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
|
||||||
Scalar& s() { return m_s; }
|
EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
|
||||||
Scalar s() const { return m_s; }
|
EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
|
||||||
|
|
||||||
/** Concatenates two planar rotation */
|
/** Concatenates two planar rotation */
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
JacobiRotation operator*(const JacobiRotation& other)
|
JacobiRotation operator*(const JacobiRotation& other)
|
||||||
{
|
{
|
||||||
using numext::conj;
|
using numext::conj;
|
||||||
@ -56,19 +59,26 @@ template<typename Scalar> class JacobiRotation
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the transposed transformation */
|
/** Returns the transposed transformation */
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
|
JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
|
||||||
|
|
||||||
/** Returns the adjoint transformation */
|
/** Returns the adjoint transformation */
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
|
JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
|
||||||
|
|
||||||
template<typename Derived>
|
template<typename Derived>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
|
bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
|
bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
|
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
|
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
|
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
|
||||||
|
|
||||||
Scalar m_c, m_s;
|
Scalar m_c, m_s;
|
||||||
@ -264,6 +274,7 @@ namespace internal {
|
|||||||
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
|
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
|
||||||
*/
|
*/
|
||||||
template<typename VectorX, typename VectorY, typename OtherScalar>
|
template<typename VectorX, typename VectorY, typename OtherScalar>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
|
void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1211,7 +1211,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
|
|||||||
#endif
|
#endif
|
||||||
}//end deflation
|
}//end deflation
|
||||||
|
|
||||||
#ifndef __CUDACC__
|
#ifndef EIGEN_CUDACC
|
||||||
/** \svd_module
|
/** \svd_module
|
||||||
*
|
*
|
||||||
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
|
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
|
||||||
|
@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
|
|||||||
// eval without temporary
|
// eval without temporary
|
||||||
dst.resize(src.rows(), src.cols());
|
dst.resize(src.rows(), src.cols());
|
||||||
dst.setZero();
|
dst.setZero();
|
||||||
dst.reserve((std::max)(src.rows(),src.cols())*2);
|
dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
|
||||||
for (Index j=0; j<outerEvaluationSize; ++j)
|
for (Index j=0; j<outerEvaluationSize; ++j)
|
||||||
{
|
{
|
||||||
dst.startVec(j);
|
dst.startVec(j);
|
||||||
@ -107,7 +107,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
|
|||||||
|
|
||||||
DstXprType temp(src.rows(), src.cols());
|
DstXprType temp(src.rows(), src.cols());
|
||||||
|
|
||||||
temp.reserve((std::max)(src.rows(),src.cols())*2);
|
temp.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
|
||||||
for (Index j=0; j<outerEvaluationSize; ++j)
|
for (Index j=0; j<outerEvaluationSize; ++j)
|
||||||
{
|
{
|
||||||
temp.startVec(j);
|
temp.startVec(j);
|
||||||
|
@ -327,7 +327,7 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
|
|||||||
internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
|
internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
|
||||||
m_isEtreeOk = true;
|
m_isEtreeOk = true;
|
||||||
|
|
||||||
m_R.resize(m, n);
|
m_R.resize(diagSize, n);
|
||||||
m_Q.resize(m, diagSize);
|
m_Q.resize(m, diagSize);
|
||||||
|
|
||||||
// Allocate space for nonzero elements : rough estimation
|
// Allocate space for nonzero elements : rough estimation
|
||||||
|
@ -120,6 +120,8 @@ run time. However, these assertions do cost time and can thus be turned off.
|
|||||||
- \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
|
- \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
|
||||||
temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
|
temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
|
||||||
this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
|
this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
|
||||||
|
- \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host only,
|
||||||
|
and never called from device code.
|
||||||
|
|
||||||
|
|
||||||
- \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
|
- \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
|
||||||
|
@ -261,6 +261,8 @@ x.setIdentity();
|
|||||||
Vector3f::UnitX() // 1 0 0
|
Vector3f::UnitX() // 1 0 0
|
||||||
Vector3f::UnitY() // 0 1 0
|
Vector3f::UnitY() // 0 1 0
|
||||||
Vector3f::UnitZ() // 0 0 1
|
Vector3f::UnitZ() // 0 0 1
|
||||||
|
Vector4f::Unit(i)
|
||||||
|
x.setUnit(i);
|
||||||
\endcode
|
\endcode
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
@ -278,6 +280,7 @@ N/A
|
|||||||
|
|
||||||
|
|
||||||
VectorXf::Unit(size,i)
|
VectorXf::Unit(size,i)
|
||||||
|
x.setUnit(size,i);
|
||||||
VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
|
VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
|
||||||
== Vector4f::UnitY()
|
== Vector4f::UnitY()
|
||||||
\endcode
|
\endcode
|
||||||
@ -285,7 +288,12 @@ VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
|
|||||||
</tr>
|
</tr>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
|
Note that it is allowed to call any of the \c set* functions to a dynamic-sized vector or matrix without passing new sizes.
|
||||||
|
For instance:
|
||||||
|
\code
|
||||||
|
MatrixXi M(3,3);
|
||||||
|
M.setIdentity();
|
||||||
|
\endcode
|
||||||
|
|
||||||
\subsection QuickRef_Map Mapping external arrays
|
\subsection QuickRef_Map Mapping external arrays
|
||||||
|
|
||||||
|
@ -63,6 +63,8 @@ In addition you can choose which parts will be substituted by defining one or mu
|
|||||||
<tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
|
<tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
|
The options can be combined with \b MKL_DIRECT_CALL to enable MKL direct call feature. This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices. To make it work properly, the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined.
|
||||||
|
|
||||||
Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
|
Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
|
||||||
|
|
||||||
Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
|
Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
|
||||||
|
@ -3,18 +3,16 @@ namespace Eigen {
|
|||||||
|
|
||||||
/** \page TopicCUDA Using Eigen in CUDA kernels
|
/** \page TopicCUDA Using Eigen in CUDA kernels
|
||||||
|
|
||||||
\b Disclaimer: this page is about an \b experimental feature in %Eigen.
|
Staring from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays for fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc most Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords making them callable from both host and device code.
|
||||||
|
This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen's header.
|
||||||
Staring from CUDA 5.0, the CUDA compiler, \c nvcc, is able to properly parse %Eigen's code (almost).
|
This might be usefull to disable some warnings when a .cu file makes use of Eigen on the host side only.
|
||||||
A few adaptations of the %Eigen's code already allows to use some parts of %Eigen in your own CUDA kernels.
|
However, in both cases, host's SIMD vectorization has to be disabled in .cu files.
|
||||||
To this end you need the devel branch of %Eigen, CUDA 5.0 or greater with GCC.
|
It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files.
|
||||||
|
|
||||||
Known issues:
|
Known issues:
|
||||||
|
|
||||||
- \c nvcc with MS Visual Studio does not work (patch welcome)
|
- \c nvcc with MS Visual Studio does not work (patch welcome)
|
||||||
|
|
||||||
- \c nvcc with \c clang does not work (patch welcome)
|
|
||||||
|
|
||||||
- \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file. To workaround this, you can add the following before including any other files:
|
- \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file. To workaround this, you can add the following before including any other files:
|
||||||
\code
|
\code
|
||||||
// workaround issue between gcc >= 4.7 and cuda 5.5
|
// workaround issue between gcc >= 4.7 and cuda 5.5
|
||||||
|
7
doc/snippets/Matrix_Map_stride.cpp
Normal file
7
doc/snippets/Matrix_Map_stride.cpp
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
Matrix4i A;
|
||||||
|
A << 1, 2, 3, 4,
|
||||||
|
5, 6, 7, 8,
|
||||||
|
9, 10, 11, 12,
|
||||||
|
13, 14, 15, 16;
|
||||||
|
|
||||||
|
std::cout << Matrix2i::Map(&A(1,1),Stride<8,2>()) << std::endl;
|
@ -104,7 +104,8 @@ void test_bdcsvd()
|
|||||||
CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
|
CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
|
||||||
|
|
||||||
// Check that preallocation avoids subsequent mallocs
|
// Check that preallocation avoids subsequent mallocs
|
||||||
CALL_SUBTEST_9( svd_preallocate<void>() );
|
// Disbaled because not supported by BDCSVD
|
||||||
|
// CALL_SUBTEST_9( svd_preallocate<void>() );
|
||||||
|
|
||||||
CALL_SUBTEST_2( svd_underoverflow<void>() );
|
CALL_SUBTEST_2( svd_underoverflow<void>() );
|
||||||
}
|
}
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
|
|
||||||
#include <math_constants.h>
|
#include <math_constants.h>
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -231,6 +231,19 @@ template<typename Scalar> void mapQuaternion(void){
|
|||||||
VERIFY_IS_APPROX(mq3*mq2, q3*q2);
|
VERIFY_IS_APPROX(mq3*mq2, q3*q2);
|
||||||
VERIFY_IS_APPROX(mcq1*mq2, q1*q2);
|
VERIFY_IS_APPROX(mcq1*mq2, q1*q2);
|
||||||
VERIFY_IS_APPROX(mcq3*mq2, q3*q2);
|
VERIFY_IS_APPROX(mcq3*mq2, q3*q2);
|
||||||
|
|
||||||
|
// Bug 1461, compilation issue with Map<const Quat>::w(), and other reference/constness checks:
|
||||||
|
VERIFY_IS_APPROX(mcq3.coeffs().x() + mcq3.coeffs().y() + mcq3.coeffs().z() + mcq3.coeffs().w(), mcq3.coeffs().sum());
|
||||||
|
VERIFY_IS_APPROX(mcq3.x() + mcq3.y() + mcq3.z() + mcq3.w(), mcq3.coeffs().sum());
|
||||||
|
mq3.w() = 1;
|
||||||
|
const Quaternionx& cq3(q3);
|
||||||
|
VERIFY( &cq3.x() == &q3.x() );
|
||||||
|
const MQuaternionUA& cmq3(mq3);
|
||||||
|
VERIFY( &cmq3.x() == &mq3.x() );
|
||||||
|
// FIXME the following should be ok. The problem is that currently the LValueBit flag
|
||||||
|
// is used to determine wether we can return a coeff by reference or not, which is not enough for Map<const ...>.
|
||||||
|
//const MCQuaternionUA& cmcq3(mcq3);
|
||||||
|
//VERIFY( &cmcq3.x() == &mcq3.x() );
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Scalar> void quaternionAlignment(void){
|
template<typename Scalar> void quaternionAlignment(void){
|
||||||
|
@ -58,7 +58,7 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
|
|||||||
MatrixType m = MatrixType::Random(rows,cols);
|
MatrixType m = MatrixType::Random(rows,cols);
|
||||||
Scalar s1 = internal::random<Scalar>();
|
Scalar s1 = internal::random<Scalar>();
|
||||||
|
|
||||||
Index arraysize = 2*(rows+4)*(cols+4);
|
Index arraysize = 4*(rows+4)*(cols+4);
|
||||||
|
|
||||||
Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
|
Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
|
||||||
Scalar* array1 = a_array1;
|
Scalar* array1 = a_array1;
|
||||||
@ -143,9 +143,62 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
|
|||||||
VERIFY_IS_APPROX(map,s1*m);
|
VERIFY_IS_APPROX(map,s1*m);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test inner stride and no outer stride
|
||||||
|
for(int k=0; k<2; ++k)
|
||||||
|
{
|
||||||
|
if(k==1 && (m.innerSize()*2)*m.outerSize() > maxsize2)
|
||||||
|
break;
|
||||||
|
Scalar* array = (k==0 ? array1 : array2);
|
||||||
|
|
||||||
|
Map<MatrixType, Alignment, InnerStride<Dynamic> > map(array, rows, cols, InnerStride<Dynamic>(2));
|
||||||
|
map = m;
|
||||||
|
VERIFY(map.outerStride() == map.innerSize()*2);
|
||||||
|
for(int i = 0; i < m.outerSize(); ++i)
|
||||||
|
for(int j = 0; j < m.innerSize(); ++j)
|
||||||
|
{
|
||||||
|
VERIFY(array[map.innerSize()*i*2+j*2] == m.coeffByOuterInner(i,j));
|
||||||
|
VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
|
||||||
|
}
|
||||||
|
VERIFY_IS_APPROX(s1*map,s1*m);
|
||||||
|
map *= s1;
|
||||||
|
VERIFY_IS_APPROX(map,s1*m);
|
||||||
|
}
|
||||||
|
|
||||||
internal::aligned_delete(a_array1, arraysize+1);
|
internal::aligned_delete(a_array1, arraysize+1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Additional tests for inner-stride but no outer-stride
|
||||||
|
template<int>
|
||||||
|
void bug1453()
|
||||||
|
{
|
||||||
|
const int data[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||||
|
typedef Matrix<int,Dynamic,Dynamic,RowMajor> RowMatrixXi;
|
||||||
|
typedef Matrix<int,2,3,ColMajor> ColMatrix23i;
|
||||||
|
typedef Matrix<int,3,2,ColMajor> ColMatrix32i;
|
||||||
|
typedef Matrix<int,2,3,RowMajor> RowMatrix23i;
|
||||||
|
typedef Matrix<int,3,2,RowMajor> RowMatrix32i;
|
||||||
|
|
||||||
|
VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
|
||||||
|
VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
|
||||||
|
VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
|
||||||
|
VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
|
||||||
|
|
||||||
|
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
|
||||||
|
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
|
||||||
|
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
|
||||||
|
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
|
||||||
|
|
||||||
|
VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
|
||||||
|
VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
|
||||||
|
VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
|
||||||
|
VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
|
||||||
|
|
||||||
|
VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
|
||||||
|
VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
|
||||||
|
VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
|
||||||
|
VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
|
||||||
|
}
|
||||||
|
|
||||||
void test_mapstride()
|
void test_mapstride()
|
||||||
{
|
{
|
||||||
for(int i = 0; i < g_repeat; i++) {
|
for(int i = 0; i < g_repeat; i++) {
|
||||||
@ -176,6 +229,8 @@ void test_mapstride()
|
|||||||
CALL_SUBTEST_6( map_class_matrix<Aligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
|
CALL_SUBTEST_6( map_class_matrix<Aligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
|
||||||
CALL_SUBTEST_6( map_class_matrix<Unaligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
|
CALL_SUBTEST_6( map_class_matrix<Unaligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
|
||||||
|
|
||||||
|
CALL_SUBTEST_5( bug1453<0>() );
|
||||||
|
|
||||||
TEST_SET_BUT_UNUSED_VARIABLE(maxn);
|
TEST_SET_BUT_UNUSED_VARIABLE(maxn);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -15,6 +15,10 @@ bool check_is_convertible(const From&, const To&)
|
|||||||
return internal::is_convertible<From,To>::value;
|
return internal::is_convertible<From,To>::value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct FooReturnType {
|
||||||
|
typedef int ReturnType;
|
||||||
|
};
|
||||||
|
|
||||||
void test_meta()
|
void test_meta()
|
||||||
{
|
{
|
||||||
VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
|
VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
|
||||||
@ -76,6 +80,11 @@ void test_meta()
|
|||||||
VERIFY(( check_is_convertible(A*B, A) ));
|
VERIFY(( check_is_convertible(A*B, A) ));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
VERIFY(( internal::has_ReturnType<FooReturnType>::value ));
|
||||||
|
VERIFY(( internal::has_ReturnType<ScalarBinaryOpTraits<int,int> >::value ));
|
||||||
|
VERIFY(( !internal::has_ReturnType<MatrixXf>::value ));
|
||||||
|
VERIFY(( !internal::has_ReturnType<int>::value ));
|
||||||
|
|
||||||
VERIFY(internal::meta_sqrt<1>::ret == 1);
|
VERIFY(internal::meta_sqrt<1>::ret == 1);
|
||||||
#define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))
|
#define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))
|
||||||
VERIFY_META_SQRT(2);
|
VERIFY_META_SQRT(2);
|
||||||
|
@ -191,6 +191,24 @@ void testVectorType(const VectorType& base)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test setUnit()
|
||||||
|
if(m.size()>0)
|
||||||
|
{
|
||||||
|
for(Index k=0; k<10; ++k)
|
||||||
|
{
|
||||||
|
Index i = internal::random<Index>(0,m.size()-1);
|
||||||
|
m.setUnit(i);
|
||||||
|
VERIFY_IS_APPROX( m, VectorType::Unit(m.size(), i) );
|
||||||
|
}
|
||||||
|
if(VectorType::SizeAtCompileTime==Dynamic)
|
||||||
|
{
|
||||||
|
Index i = internal::random<Index>(0,2*m.size()-1);
|
||||||
|
m.setUnit(2*m.size(),i);
|
||||||
|
VERIFY_IS_APPROX( m, VectorType::Unit(m.size(),i) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename MatrixType>
|
template<typename MatrixType>
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
#define TEST_ENABLE_TEMPORARY_TRACKING
|
#define TEST_ENABLE_TEMPORARY_TRACKING
|
||||||
|
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
|
||||||
|
// ^^ see bug 1449
|
||||||
|
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||||
|
|
||||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
|
||||||
@ -1382,5 +1382,5 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
|||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
#endif // EIGEN_USE_GPU and __CUDACC__
|
#endif // EIGEN_USE_GPU and EIGEN_CUDACC
|
||||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||||
|
@ -553,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
|||||||
|
|
||||||
|
|
||||||
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
|
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
|
||||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||||
|
|
||||||
template <int StaticKernelSize>
|
template <int StaticKernelSize>
|
||||||
struct GetKernelSize {
|
struct GetKernelSize {
|
||||||
|
@ -211,7 +211,7 @@ struct GpuDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef EIGEN_CUDA_ARCH
|
||||||
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
|
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
|
||||||
stream_->stream());
|
stream_->stream());
|
||||||
EIGEN_UNUSED_VARIABLE(err)
|
EIGEN_UNUSED_VARIABLE(err)
|
||||||
@ -239,7 +239,7 @@ struct GpuDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef EIGEN_CUDA_ARCH
|
||||||
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
|
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
|
||||||
EIGEN_UNUSED_VARIABLE(err)
|
EIGEN_UNUSED_VARIABLE(err)
|
||||||
assert(err == cudaSuccess);
|
assert(err == cudaSuccess);
|
||||||
@ -265,7 +265,7 @@ struct GpuDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
|
||||||
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
|
#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH)
|
||||||
cudaError_t err = cudaStreamSynchronize(stream_->stream());
|
cudaError_t err = cudaStreamSynchronize(stream_->stream());
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
std::cerr << "Error detected in CUDA stream: "
|
std::cerr << "Error detected in CUDA stream: "
|
||||||
@ -304,7 +304,7 @@ struct GpuDevice {
|
|||||||
// This function checks if the CUDA runtime recorded an error for the
|
// This function checks if the CUDA runtime recorded an error for the
|
||||||
// underlying stream device.
|
// underlying stream device.
|
||||||
inline bool ok() const {
|
inline bool ok() const {
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
cudaError_t error = cudaStreamQuery(stream_->stream());
|
cudaError_t error = cudaStreamQuery(stream_->stream());
|
||||||
return (error == cudaSuccess) || (error == cudaErrorNotReady);
|
return (error == cudaSuccess) || (error == cudaErrorNotReady);
|
||||||
#else
|
#else
|
||||||
@ -323,9 +323,9 @@ struct GpuDevice {
|
|||||||
|
|
||||||
|
|
||||||
// FIXME: Should be device and kernel specific.
|
// FIXME: Should be device and kernel specific.
|
||||||
#ifdef __CUDACC__
|
#ifdef EIGEN_CUDACC
|
||||||
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
|
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef EIGEN_CUDA_ARCH
|
||||||
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
|
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
|
||||||
EIGEN_UNUSED_VARIABLE(status)
|
EIGEN_UNUSED_VARIABLE(status)
|
||||||
assert(status == cudaSuccess);
|
assert(status == cudaSuccess);
|
||||||
|
@ -35,7 +35,7 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef EIGEN_CUDA_ARCH
|
||||||
// Running on the host CPU
|
// Running on the host CPU
|
||||||
return 1;
|
return 1;
|
||||||
#else
|
#else
|
||||||
@ -45,7 +45,7 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||||
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||||
// Running on the host CPU
|
// Running on the host CPU
|
||||||
return l1CacheSize();
|
return l1CacheSize();
|
||||||
#else
|
#else
|
||||||
@ -55,7 +55,7 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||||
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||||
// Running single threaded on the host CPU
|
// Running single threaded on the host CPU
|
||||||
return l3CacheSize();
|
return l3CacheSize();
|
||||||
#else
|
#else
|
||||||
@ -65,13 +65,13 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef EIGEN_CUDA_ARCH
|
||||||
// Running single threaded on the host CPU
|
// Running single threaded on the host CPU
|
||||||
// Should return an enum that encodes the ISA supported by the CPU
|
// Should return an enum that encodes the ISA supported by the CPU
|
||||||
return 1;
|
return 1;
|
||||||
#else
|
#else
|
||||||
// Running on a CUDA device
|
// Running on a CUDA device
|
||||||
return __CUDA_ARCH__ / 100;
|
return EIGEN_CUDA_ARCH / 100;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -131,7 +131,7 @@ T loadConstant(const T* address) {
|
|||||||
return *address;
|
return *address;
|
||||||
}
|
}
|
||||||
// Use the texture cache on CUDA devices whenever possible
|
// Use the texture cache on CUDA devices whenever possible
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||||
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float loadConstant(const float* address) {
|
float loadConstant(const float* address) {
|
||||||
return __ldg(address);
|
return __ldg(address);
|
||||||
|
@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#if defined(__CUDACC__)
|
#if defined(EIGEN_CUDACC)
|
||||||
template <typename Evaluator, typename Index, bool Vectorizable>
|
template <typename Evaluator, typename Index, bool Vectorizable>
|
||||||
struct EigenMetaKernelEval {
|
struct EigenMetaKernelEval {
|
||||||
static __device__ EIGEN_ALWAYS_INLINE
|
static __device__ EIGEN_ALWAYS_INLINE
|
||||||
@ -264,7 +264,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
|
|||||||
evaluator.cleanup();
|
evaluator.cleanup();
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // __CUDACC__
|
#endif // EIGEN_CUDACC
|
||||||
#endif // EIGEN_USE_GPU
|
#endif // EIGEN_USE_GPU
|
||||||
|
|
||||||
// SYCL Executor policy
|
// SYCL Executor policy
|
||||||
|
@ -35,7 +35,7 @@ namespace {
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
|
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
|
||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
return __clz(val);
|
return __clz(val);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return cl::sycl::clz(val);
|
return cl::sycl::clz(val);
|
||||||
@ -53,7 +53,7 @@ namespace {
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
|
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
|
||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
return __clzll(val);
|
return __clzll(val);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return cl::sycl::clz(val);
|
return cl::sycl::clz(val);
|
||||||
@ -90,7 +90,7 @@ namespace {
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
|
||||||
#if defined(__CUDA_ARCH__)
|
#if defined(EIGEN_CUDA_ARCH)
|
||||||
return __umulhi(a, b);
|
return __umulhi(a, b);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
|
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
|
||||||
@ -101,7 +101,7 @@ namespace {
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
|
||||||
#if defined(__CUDA_ARCH__)
|
#if defined(EIGEN_CUDA_ARCH)
|
||||||
return __umul64hi(a, b);
|
return __umul64hi(a, b);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
|
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
|
||||||
@ -124,7 +124,7 @@ namespace {
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
struct DividerHelper<64, T> {
|
struct DividerHelper<64, T> {
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
|
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
|
||||||
#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||||
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
|
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
|
||||||
#else
|
#else
|
||||||
const uint64_t shift = 1ULL << log_div;
|
const uint64_t shift = 1ULL << log_div;
|
||||||
@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
return (__umulhi(magic, n) >> shift);
|
return (__umulhi(magic, n) >> shift);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
|
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
|
||||||
|
@ -27,7 +27,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// SFINAE requires variadic templates
|
// SFINAE requires variadic templates
|
||||||
#ifndef __CUDACC__
|
#ifndef EIGEN_CUDACC
|
||||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||||
// SFINAE doesn't work for gcc <= 4.7
|
// SFINAE doesn't work for gcc <= 4.7
|
||||||
#ifdef EIGEN_COMP_GNUC
|
#ifdef EIGEN_COMP_GNUC
|
||||||
|
@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// For CUDA packet types when using a GpuDevice
|
// For CUDA packet types when using a GpuDevice
|
||||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
|
||||||
template <>
|
template <>
|
||||||
struct PacketType<half, GpuDevice> {
|
struct PacketType<half, GpuDevice> {
|
||||||
typedef half2 type;
|
typedef half2 type;
|
||||||
|
@ -16,7 +16,7 @@ namespace internal {
|
|||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
|
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef EIGEN_CUDA_ARCH
|
||||||
// We don't support 3d kernels since we currently only use 1 and
|
// We don't support 3d kernels since we currently only use 1 and
|
||||||
// 2d kernels.
|
// 2d kernels.
|
||||||
assert(threadIdx.z == 0);
|
assert(threadIdx.z == 0);
|
||||||
|
@ -334,7 +334,7 @@ struct OuterReducer {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||||
template <int B, int N, typename S, typename R, typename I>
|
template <int B, int N, typename S, typename R, typename I>
|
||||||
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||||
|
|
||||||
@ -694,7 +694,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
|||||||
#ifdef EIGEN_USE_THREADS
|
#ifdef EIGEN_USE_THREADS
|
||||||
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
|
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
|
||||||
#endif
|
#endif
|
||||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||||
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||||
#ifdef EIGEN_HAS_CUDA_FP16
|
#ifdef EIGEN_HAS_CUDA_FP16
|
||||||
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
||||||
@ -781,7 +781,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
|||||||
Op m_reducer;
|
Op m_reducer;
|
||||||
|
|
||||||
// For full reductions
|
// For full reductions
|
||||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||||
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
|
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
|
||||||
static const bool RunningOnSycl = false;
|
static const bool RunningOnSycl = false;
|
||||||
#elif defined(EIGEN_USE_SYCL)
|
#elif defined(EIGEN_USE_SYCL)
|
||||||
|
@ -14,7 +14,7 @@ namespace Eigen {
|
|||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||||
// Full reducers for GPU, don't vectorize for now
|
// Full reducers for GPU, don't vectorize for now
|
||||||
|
|
||||||
// Reducer function that enables multiple cuda thread to safely accumulate at the same
|
// Reducer function that enables multiple cuda thread to safely accumulate at the same
|
||||||
@ -23,7 +23,7 @@ namespace internal {
|
|||||||
// updated the content of the output address it will try again.
|
// updated the content of the output address it will try again.
|
||||||
template <typename T, typename R>
|
template <typename T, typename R>
|
||||||
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
|
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
|
||||||
#if __CUDA_ARCH__ >= 300
|
#if EIGEN_CUDA_ARCH >= 300
|
||||||
if (sizeof(T) == 4)
|
if (sizeof(T) == 4)
|
||||||
{
|
{
|
||||||
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
||||||
@ -102,7 +102,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
|
|||||||
|
|
||||||
template <>
|
template <>
|
||||||
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
||||||
#if __CUDA_ARCH__ >= 300
|
#if EIGEN_CUDA_ARCH >= 300
|
||||||
atomicAdd(output, accum);
|
atomicAdd(output, accum);
|
||||||
#else // __CUDA_ARCH__ >= 300
|
#else // __CUDA_ARCH__ >= 300
|
||||||
assert(0 && "Shouldn't be called on unsupported device");
|
assert(0 && "Shouldn't be called on unsupported device");
|
||||||
@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
|
|||||||
typename Reducer, typename Index>
|
typename Reducer, typename Index>
|
||||||
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
|
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
|
||||||
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
|
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
|
||||||
#if __CUDA_ARCH__ >= 300
|
#if EIGEN_CUDA_ARCH >= 300
|
||||||
// Initialize the output value
|
// Initialize the output value
|
||||||
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
|
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
|
||||||
if (gridDim.x == 1) {
|
if (gridDim.x == 1) {
|
||||||
@ -372,7 +372,7 @@ template <int NumPerThread, typename Self,
|
|||||||
typename Reducer, typename Index>
|
typename Reducer, typename Index>
|
||||||
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||||
typename Self::CoeffReturnType* output) {
|
typename Self::CoeffReturnType* output) {
|
||||||
#if __CUDA_ARCH__ >= 300
|
#if EIGEN_CUDA_ARCH >= 300
|
||||||
typedef typename Self::CoeffReturnType Type;
|
typedef typename Self::CoeffReturnType Type;
|
||||||
eigen_assert(blockDim.y == 1);
|
eigen_assert(blockDim.y == 1);
|
||||||
eigen_assert(blockDim.z == 1);
|
eigen_assert(blockDim.z == 1);
|
||||||
|
@ -242,7 +242,7 @@ struct ScanLauncher {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||||
|
|
||||||
// GPU implementation of scan
|
// GPU implementation of scan
|
||||||
// TODO(ibab) This placeholder implementation performs multiple scans in
|
// TODO(ibab) This placeholder implementation performs multiple scans in
|
||||||
@ -281,7 +281,7 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
|
|||||||
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
|
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
#endif // EIGEN_USE_GPU && __CUDACC__
|
#endif // EIGEN_USE_GPU && EIGEN_CUDACC
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
// The array class is only available starting with cxx11. Emulate our own here
|
// The array class is only available starting with cxx11. Emulate our own here
|
||||||
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
|
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
|
||||||
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
|
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
|
||||||
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
|
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
template <typename T, size_t n> class array {
|
template <typename T, size_t n> class array {
|
||||||
|
@ -121,7 +121,7 @@ template <>
|
|||||||
struct lgamma_impl<float> {
|
struct lgamma_impl<float> {
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
static EIGEN_STRONG_INLINE float run(float x) {
|
static EIGEN_STRONG_INLINE float run(float x) {
|
||||||
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
||||||
int dummy;
|
int dummy;
|
||||||
return ::lgammaf_r(x, &dummy);
|
return ::lgammaf_r(x, &dummy);
|
||||||
#else
|
#else
|
||||||
@ -134,7 +134,7 @@ template <>
|
|||||||
struct lgamma_impl<double> {
|
struct lgamma_impl<double> {
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
static EIGEN_STRONG_INLINE double run(double x) {
|
static EIGEN_STRONG_INLINE double run(double x) {
|
||||||
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
||||||
int dummy;
|
int dummy;
|
||||||
return ::lgamma_r(x, &dummy);
|
return ::lgamma_r(x, &dummy);
|
||||||
#else
|
#else
|
||||||
|
@ -17,7 +17,7 @@ namespace internal {
|
|||||||
// Make sure this is only available when targeting a GPU: we don't want to
|
// Make sure this is only available when targeting a GPU: we don't want to
|
||||||
// introduce conflicts between these packet_traits definitions and the ones
|
// introduce conflicts between these packet_traits definitions and the ones
|
||||||
// we'll use on the host side (SSE, AVX, ...)
|
// we'll use on the host side (SSE, AVX, ...)
|
||||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
float4 plgamma<float4>(const float4& a)
|
float4 plgamma<float4>(const float4& a)
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
|
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
#define EIGEN_TEST_FUNC cxx11_tensor_complex
|
#define EIGEN_TEST_FUNC cxx11_tensor_complex
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
|
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
|
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
|
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
#define EIGEN_USE_GPU
|
#define EIGEN_USE_GPU
|
||||||
|
|
||||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
#if EIGEN_CUDACC_VER >= 70500
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user