Merged eigen/eigen into default

This commit is contained in:
Konstantinos Margaritis 2017-08-24 12:24:01 +03:00
commit 1affe3d8df
83 changed files with 620 additions and 288 deletions

View File

@ -9,6 +9,7 @@
#define EIGEN_CHOLESKY_MODULE_H
#include "Core"
#include "Jacobi"
#include "src/Core/util/DisableStupidWarnings.h"
@ -31,7 +32,11 @@
#include "src/Cholesky/LLT.h"
#include "src/Cholesky/LDLT.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/Cholesky/LLT_LAPACKE.h"
#endif
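For context, a minimal sketch of how a build would opt into the new MKL header path (assuming MKL headers and libraries are available; EIGEN_USE_MKL_ALL is Eigen's usual umbrella switch, which turns on EIGEN_USE_LAPACKE and, once MKL_support.h finds the headers, EIGEN_USE_MKL):

#define EIGEN_USE_MKL_ALL    // request the MKL-backed code paths
#include <Eigen/Cholesky>    // now includes mkl_lapacke.h instead of src/misc/lapacke.h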

View File

@ -14,8 +14,24 @@
// first thing Eigen does: stop the compiler from committing suicide
#include "src/Core/util/DisableStupidWarnings.h"
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
#define EIGEN_CUDACC __CUDACC__
#endif
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#elif defined(__CUDACC_VER__)
#define EIGEN_CUDACC_VER __CUDACC_VER__
#else
#define EIGEN_CUDACC_VER 0
#endif
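As a quick sanity check of the new macro (a sketch, not part of the patch): under nvcc 9.1, __CUDACC_VER_MAJOR__ is 9 and __CUDACC_VER_MINOR__ is 1, so EIGEN_CUDACC_VER evaluates to 9*10000 + 1*100 == 90100; older toolkits fall back to __CUDACC_VER__, and non-CUDA builds get 0. Version gates can then be written uniformly:

#if EIGEN_CUDACC_VER >= 90000
// CUDA 9+ specific code path
#endif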
// Handle NVCC/CUDA/SYCL
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
// Do not try asserts on CUDA and SYCL!
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
@ -30,7 +46,7 @@
#endif
// All functions callable from CUDA code must be qualified with __device__
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
// Do not try to vectorize on CUDA and SYCL!
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
@ -47,16 +63,20 @@
#define EIGEN_DEVICE_FUNC
#endif
#ifdef __NVCC__
#define EIGEN_DONT_VECTORIZE
#endif
// When compiling CUDA device code with NVCC, pull in math functions from the
// global namespace. In host mode, and when compiling device code with clang, use the
// std versions.
#if defined(__CUDA_ARCH__) && defined(__NVCC__)
#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
#else
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
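A sketch of the intended call pattern, mirroring how Eigen's numext wrappers use this macro (my_sin is a hypothetical helper):

template<typename T>
EIGEN_DEVICE_FUNC T my_sin(const T& x) {
  EIGEN_USING_STD_MATH(sin);  // ::sin in NVCC device code, std::sin otherwise
  return sin(x);              // unqualified call resolves to whichever was pulled in
}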
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
#define EIGEN_EXCEPTIONS
#endif
@ -233,10 +253,10 @@
#define EIGEN_HAS_FP16_C
#endif
#if defined __CUDACC__
#if defined EIGEN_CUDACC
#define EIGEN_VECTORIZE_CUDA
#include <vector_types.h>
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif

View File

@ -45,7 +45,11 @@
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"

View File

@ -28,7 +28,11 @@
#include "src/LU/FullPivLU.h"
#include "src/LU/PartialPivLU.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/LU/PartialPivLU_LAPACKE.h"
#endif
#include "src/LU/Determinant.h"

View File

@ -36,7 +36,11 @@
#include "src/QR/ColPivHouseholderQR.h"
#include "src/QR/CompleteOrthogonalDecomposition.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/QR/HouseholderQR_LAPACKE.h"
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
#endif

View File

@ -37,7 +37,11 @@
#include "src/SVD/JacobiSVD.h"
#include "src/SVD/BDCSVD.h"
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/SVD/JacobiSVD_LAPACKE.h"
#endif

View File

@ -248,7 +248,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
* \c NumericalIssue if the factorization failed because of a zero pivot.
*/
ComputationInfo info() const
{
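A hedged usage sketch of the clarified diagnostic (A is an assumed symmetric MatrixXd):

Eigen::LDLT<Eigen::MatrixXd> ldlt(A);
if (ldlt.info() == Eigen::NumericalIssue) {
  // the factorization hit a zero pivot; the result should not be trusted
}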

View File

@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
*
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
* \tparam _UpLo the triangular part that will be used for the decomposition: Lower (default) or Upper.
* The other triangular part won't be read.
* The other triangular part won't be read.
*
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
* matrix A such that A = LL^* = U^*U, where L is lower triangular.
@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out
*
* \b Performance: for best performance, it is recommended to use a column-major storage format
* with the Lower triangular part (the default), or, equivalently, a row-major storage format
* with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
* step, and rank-updates can be up to 3 times slower.
*
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
*
* Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
* Therefore, the strict lower part does not have to store correct values.
*
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
* Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
* the strict lower part does not have to store correct values.
*/
template<typename _MatrixType, int _UpLo> class LLT
{
public:
@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
}
template<typename Derived>
void solveInPlace(MatrixBase<Derived> &bAndX) const;
void solveInPlace(const MatrixBase<Derived> &bAndX) const;
template<typename InputType>
LLT& compute(const EigenBase<InputType>& matrix);
@ -177,7 +181,7 @@ template<typename _MatrixType, int _UpLo> class LLT
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears to be negative.
* \c NumericalIssue if the matrix appears not to be positive definite.
*/
ComputationInfo info() const
{
@ -424,7 +428,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix.resize(size, size);
m_matrix = a.derived();
if (!internal::is_same_dense(m_matrix, a.derived()))
m_matrix = a.derived();
// Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0);
@ -484,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
*
* This version avoids a copy when the right hand side matrix b is not needed anymore.
*
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
* \sa LLT::solve(), MatrixBase::llt()
*/
template<typename MatrixType, int _UpLo>
template<typename Derived>
void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(m_matrix.rows()==bAndX.rows());
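The const qualifier is what lets rvalue expressions bind to the parameter; a sketch under the assumption that A and B are MatrixXd:

Eigen::LLT<Eigen::MatrixXd> llt(A);
llt.solveInPlace(B.col(0));  // the temporary Block expression now binds;
                             // its coefficients are overwritten with the solution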

View File

@ -861,6 +861,42 @@ template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
{ return Derived::Unit(3); }
/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
*
* \param i index of the unique coefficient to be set to 1
*
* \only_for_vectors
*
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
*/
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
eigen_assert(i<size());
derived().setZero();
derived().coeffRef(i) = Scalar(1);
return derived();
}
/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
*
* \param newSize the new size of the vector
* \param i index of the unique coefficient to be set to 1
*
* \only_for_vectors
*
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
*/
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
eigen_assert(i<newSize);
derived().resize(newSize);
return setUnit(i);
}
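A usage sketch of the two new members (VectorXd assumed):

Eigen::VectorXd v(4);
v.setUnit(2);     // v is now (0, 0, 1, 0)
v.setUnit(6, 0);  // resizes to 6, then sets v to (1, 0, 0, 0, 0, 0)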
} // end namespace Eigen
#endif // EIGEN_CWISE_NULLARY_OP_H

View File

@ -18,18 +18,33 @@ enum {
Small = 3
};
// Define the threshold value to fall back from the generic matrix-matrix product
// implementation (heavy) to the lightweight coeff-based product one.
// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
// in products/GeneralMatrixMatrix.h for more details.
// TODO This threshold should also be used in the compile-time selector below.
#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
// This default value has been obtained on a Haswell architecture.
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
#endif
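Because the guard is a plain #ifndef, the threshold can be tuned per translation unit before any Eigen header is included (a sketch; 32 is an arbitrary illustrative value):

#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 32  // must precede the first Eigen include
#include <Eigen/Dense>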
namespace internal {
template<int Rows, int Cols, int Depth> struct product_type_selector;
template<int Size, int MaxSize> struct product_size_category
{
enum { is_large = MaxSize == Dynamic ||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
value = is_large ? Large
: Size == 1 ? 1
: Small
enum {
#ifndef EIGEN_CUDA_ARCH
is_large = MaxSize == Dynamic ||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
#else
is_large = 0,
#endif
value = is_large ? Large
: Size == 1 ? 1
: Small
};
};
@ -379,8 +394,6 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
*
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/
#ifndef __CUDACC__
template<typename Derived>
template<typename OtherDerived>
inline const Product<Derived, OtherDerived>
@ -412,8 +425,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived());
}
#endif // __CUDACC__
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
*
* The returned product will behave like any other expressions: the coefficients of the product will be

View File

@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
{
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
#if defined(__LP64__)
// 64-bit pointer operand constraint for inlined asm
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@ -526,7 +526,7 @@ inline void palign(PacketType& first, const PacketType& second)
***************************************************************************/
// Eigen+CUDA does not support complexes.
#ifndef __CUDACC__
#ifndef EIGEN_CUDACC
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

View File

@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
{
typedef traits<PlainObjectType> TraitsBase;
enum {
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
? PlainObjectType::ColsAtCompileTime
: PlainObjectType::RowsAtCompileTime,
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime)
? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
? Dynamic
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
: int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
inline Index outerStride() const
{
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
: IsVectorAtCompileTime ? this->size()
: int(Flags)&RowMajorBit ? this->cols()
: this->rows();
: internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? internal::traits<Map>::OuterStrideAtCompileTime
: IsVectorAtCompileTime ? (this->size() * innerStride())
: int(Flags)&RowMajorBit ? (this->cols() * innerStride())
: (this->rows() * innerStride());
}
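A sketch of the behavioral fix for strided vector maps (data is an assumed buffer of at least 8 floats):

Eigen::Map<Eigen::VectorXf, 0, Eigen::InnerStride<2> > v(data, 4);
// v.outerStride() now reports size() * innerStride() == 8 rather than 4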
/** Constructor in the fixed-size case.

View File

@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
template<typename T>
struct real_impl<std::complex<T> >
{
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
template<typename T>
struct imag_impl<std::complex<T> >
{
@ -778,7 +778,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isfinite_impl(const T& x)
{
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
@ -793,7 +793,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isinf_impl(const T& x)
{
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
@ -808,7 +808,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isnan_impl(const T& x)
{
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
@ -874,7 +874,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
namespace numext {
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@ -1088,7 +1088,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log1p(const float &x) { return ::log1pf(x); }
@ -1146,7 +1146,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float floor(const float &x) { return ::floorf(x); }
@ -1167,7 +1167,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float ceil(const float &x) { return ::ceilf(x); }
@ -1225,7 +1225,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log(const float &x) { return ::logf(x); }
@ -1253,7 +1253,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }
@ -1283,7 +1283,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float exp(const float &x) { return ::expf(x); }
@ -1303,7 +1303,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float expm1(const float &x) { return ::expm1f(x); }
@ -1323,7 +1323,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cos(const float &x) { return ::cosf(x); }
@ -1343,7 +1343,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sin(const float &x) { return ::sinf(x); }
@ -1363,7 +1363,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tan(const float &x) { return ::tanf(x); }
@ -1378,13 +1378,14 @@ T acos(const T &x) {
return acos(x);
}
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T acosh(const T &x) {
EIGEN_USING_STD_MATH(acosh);
return acosh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); }
@ -1393,7 +1394,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float acos(const float &x) { return ::acosf(x); }
@ -1408,12 +1409,14 @@ T asin(const T &x) {
return asin(x);
}
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T asinh(const T &x) {
EIGEN_USING_STD_MATH(asinh);
return asinh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); }
@ -1422,7 +1425,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float asin(const float &x) { return ::asinf(x); }
@ -1437,12 +1440,14 @@ T atan(const T &x) {
return atan(x);
}
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T atanh(const T &x) {
EIGEN_USING_STD_MATH(atanh);
return atanh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); }
@ -1451,7 +1456,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float atan(const float &x) { return ::atanf(x); }
@ -1472,7 +1477,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cosh(const float &x) { return ::coshf(x); }
@ -1492,7 +1497,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sinh(const float &x) { return ::sinhf(x); }
@ -1510,12 +1515,12 @@ T tanh(const T &x) {
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
#endif
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(const float &x) { return ::tanhf(x); }
@ -1535,7 +1540,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float fmod(const float& a, const float& b) {

View File

@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other);
#ifdef __CUDACC__
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
operator*(const MatrixBase<OtherDerived> &other) const
{ return this->lazyProduct(other); }
#else
template<typename OtherDerived>
const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const;
#endif
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
Derived& setIdentity();
EIGEN_DEVICE_FUNC
Derived& setIdentity(Index rows, Index cols);
EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@ -305,7 +298,7 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
{ return cwiseNotEqual(other).any(); }
NoAlias<Derived,Eigen::MatrixBase > noalias();
NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
// TODO forceAlignedAccess is temporarily disabled
// Need to find a nicer workaround.
@ -437,8 +430,10 @@ template<typename Derived> class MatrixBase
///////// Jacobi module /////////
template<typename OtherScalar>
EIGEN_DEVICE_FUNC
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
template<typename OtherScalar>
EIGEN_DEVICE_FUNC
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
///////// SparseCore module /////////

View File

@ -33,6 +33,7 @@ class NoAlias
public:
typedef typename ExpressionType::Scalar Scalar;
EIGEN_DEVICE_FUNC
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
template<typename OtherDerived>

View File

@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
* \a data pointers.
*
* Here is an example using strides:
* \include Matrix_Map_stride.cpp
* Output: \verbinclude Matrix_Map_stride.out
*
* \see class Map
*/
//@{
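Independent of the referenced Matrix_Map_stride.cpp snippet (not reproduced here), a minimal outer-stride example would look like this (data assumed to hold at least 12 doubles):

double data[12] = {};
Eigen::Map<Eigen::MatrixXd, 0, Eigen::OuterStride<> > m(data, 3, 3, Eigen::OuterStride<>(4));
// views a 3x3 matrix inside a column-major buffer whose columns start 4 entries apart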

View File

@ -851,7 +851,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
}
#ifndef __CUDACC__
#ifndef EIGEN_CUDACC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{
@ -895,7 +895,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
}
#ifndef __CUDACC__
#ifndef EIGEN_CUDACC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{

View File

@ -16,7 +16,7 @@ namespace Eigen {
namespace internal {
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device

View File

@ -140,7 +140,7 @@ struct half : public half_impl::half_base {
namespace half_impl {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
@ -281,7 +281,7 @@ union FP32 {
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __float2half(ff);
#elif defined(EIGEN_HAS_FP16_C)
@ -336,7 +336,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
@ -370,7 +370,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
@ -386,7 +386,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
return half(hexp(a));
#else
return half(::expf(float(a)));
@ -396,7 +396,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return half(::hlog(a));
#else
return half(::logf(float(a)));
@ -409,7 +409,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
@ -431,14 +431,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
return half(hceil(a));
#else
return half(::ceilf(float(a)));
@ -446,7 +446,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
@ -455,7 +455,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
@ -576,7 +576,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return Eigen::half(::hlog(a));
#else
return Eigen::half(::logf(float(a)));
@ -610,14 +610,14 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
}
#endif
// ldg() has an overload for __half, but we also need one for Eigen::half.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
@ -625,7 +625,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif
#if defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDA_ARCH)
namespace Eigen {
namespace numext {

View File

@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{

View File

@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
@ -196,7 +196,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const float4*)from);
#else
return make_float4(from[0], from[1], from[2], from[3]);
@ -204,7 +204,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const double2*)from);
#else
return make_double2(from[0], from[1]);
@ -213,7 +213,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@ -221,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_double2(__ldg(from+0), __ldg(from+1));
#else
return make_double2(from[0], from[1]);

View File

@ -15,7 +15,7 @@ namespace Eigen {
namespace internal {
// Most of the following operations require arch >= 3.0
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
template<> struct is_arithmetic<half2> { enum { value = true }; };
@ -69,7 +69,7 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half*
template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
#if EIGEN_CUDA_ARCH >= 350
return __ldg((const half2*)from);
#else
return __halves2half2(*(from+0), *(from+1));
@ -78,7 +78,7 @@ template<>
template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
#if EIGEN_CUDA_ARCH >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
return __halves2half2(*(from+0), *(from+1));
@ -116,7 +116,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
}
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
@ -125,7 +125,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
}
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@ -139,7 +139,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
return __hsub2(a, b);
#else
float a1 = __low2float(a);
@ -153,7 +153,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
return __hneg2(a);
#else
float a1 = __low2float(a);
@ -165,7 +165,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@ -179,7 +179,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
@ -225,7 +225,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
@ -235,7 +235,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
@ -247,7 +247,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
@ -259,7 +259,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
#if EIGEN_CUDA_ARCH >= 530
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
@ -284,7 +284,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
template<> __device__ EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {

View File

@ -19,7 +19,7 @@ struct scalar_cast_op<float, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __float2half(a);
#else
return Eigen::half(a);
@ -37,7 +37,7 @@ struct scalar_cast_op<int, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __float2half(static_cast<float>(a));
#else
return Eigen::half(static_cast<float>(a));
@ -55,7 +55,7 @@ struct scalar_cast_op<Eigen::half, float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef float result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __half2float(a);
#else
return static_cast<float>(a);
@ -69,7 +69,7 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
template <>
struct type_casting_traits<Eigen::half, float> {

View File

@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
{
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
// FIXME is there some kind of cuda::swap?
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
#else

View File

@ -427,7 +427,13 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
// to determine the following heuristic.
// EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
// unless it has been specialized by the user or for a given architecture.
// Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
// I'm not sure it is still required.
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
lazyproduct::evalTo(dst, lhs, rhs);
else
{
@ -439,7 +445,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
lazyproduct::addTo(dst, lhs, rhs);
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
@ -448,7 +454,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
lazyproduct::subTo(dst, lhs, rhs);
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));

View File

@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
EIGTYPE beta(1); \
BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
} \
};
@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
} \
};
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
#else
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
#endif
// TODO handle complex cases
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)

View File

@ -46,7 +46,7 @@ namespace internal {
// gemm specialization
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
template< \
typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
@ -100,13 +100,20 @@ static void run(Index rows, Index cols, Index depth, \
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
}};
GEMM_SPECIALIZATION(double, d, double, d)
GEMM_SPECIALIZATION(float, f, float, s)
GEMM_SPECIALIZATION(dcomplex, cd, double, z)
GEMM_SPECIALIZATION(scomplex, cf, float, c)
#ifdef EIGEN_USE_MKL
GEMM_SPECIALIZATION(double, d, double, dgemm)
GEMM_SPECIALIZATION(float, f, float, sgemm)
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
#else
GEMM_SPECIALIZATION(double, d, double, dgemm_)
GEMM_SPECIALIZATION(float, f, float, sgemm_)
GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
#endif
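Roughly, the macro now receives the complete symbol instead of pasting a prefix onto a hard-coded gemm_ suffix, which lets the MKL branch target the non-underscored entry points and MKL's native complex types; a sketch of the resulting expansions (arguments abbreviated):

// generic BLAS build:  dgemm_(&transa, &transb, &m, &n, &k, (const double*)&numext::real_ref(alpha), ...);
// EIGEN_USE_MKL build: dgemm(&transa, &transb, &m, &n, &k, (const double*)&numext::real_ref(alpha), ...);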
} // end namespace internal

View File

@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
{ \
@ -113,14 +113,21 @@ static void run( \
x_ptr=x_tmp.data(); \
incx=1; \
} else x_ptr=rhs; \
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
#else
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
#endif
} // end namespace internal

View File

@ -40,7 +40,7 @@ namespace internal {
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@ -81,13 +81,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _rhs; \
\
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@ -144,20 +144,26 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
EIGEN_BLAS_SYMM_L(double, double, d, d)
EIGEN_BLAS_SYMM_L(float, float, f, s)
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
#else
EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
#endif
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@ -197,13 +203,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} else b = _lhs; \
\
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
} \
};
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@ -259,15 +265,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
} \
\
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
} \
};
EIGEN_BLAS_SYMM_R(double, double, d, d)
EIGEN_BLAS_SYMM_R(float, float, f, s)
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
#else
EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
#endif
} // end namespace internal
} // end namespace Eigen

View File

@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
x_tmp=map_x.conjugate(); \
x_ptr=x_tmp.data(); \
} else x_ptr=_rhs; \
BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
}\
};
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
#else
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
#endif
} // end namespace internal

View File

@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
// To work around an "error: member reference base type 'Matrix<...>
// (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
// not a structure or union" compilation error in nvcc (tested V8.0.61),
// create a dummy internal::constructor_without_unaligned_array_assert
// object to pass to the Matrix constructor.
internal::constructor_without_unaligned_array_assert a;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();
@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
internal::constructor_without_unaligned_array_assert a;
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
triangularBuffer.setZero();
if((Mode&ZeroDiag)==ZeroDiag)
triangularBuffer.diagonal().setZero();

View File

@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
// implements col-major += alpha * op(triangular) * op(general)
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
} \
};
EIGEN_BLAS_TRMM_L(double, double, d, d)
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
EIGEN_BLAS_TRMM_L(float, float, f, s)
EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
EIGEN_BLAS_TRMM_L(float, float, f, strmm)
EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
#else
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
#endif
// implements col-major += alpha * op(general) * op(triangular)
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
/* call ?trmm*/ \
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
\
/* Add op(a_triangular)*b into res*/ \
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
} \
};
EIGEN_BLAS_TRMM_R(double, double, d, d)
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
EIGEN_BLAS_TRMM_R(float, float, f, s)
EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
EIGEN_BLAS_TRMM_R(float, float, f, strmm)
EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
#else
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
#endif
} // end namespace internal
} // end namespace Eigen

View File

@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
// implements col-major: res += alpha * op(triangular) * vector
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
enum { \
@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit BLAS ?TRMV. Fall back to the default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
EIGEN_BLAS_TRMV_CM(double, double, d, d)
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
EIGEN_BLAS_TRMV_CM(float, float, f, s)
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMV_CM(double, double, d, d,)
EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
EIGEN_BLAS_TRMV_CM(float, float, f, s,)
EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
#else
EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
#endif
// implements row-major: res += alpha * op(triangular) * vector
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
enum { \
@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
diag = IsUnitDiag ? 'U' : 'N'; \
\
/* call ?TRMV*/ \
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
\
/* Add op(a_tr)rhs into res*/ \
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
/* Non-square case - doesn't fit BLAS ?TRMV. Fall back to the default triangular product*/ \
if (size<(std::max)(rows,cols)) { \
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
m = convert_index<BlasIndex>(size); \
n = convert_index<BlasIndex>(cols-size); \
} \
BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
} \
} \
};
EIGEN_BLAS_TRMV_RM(double, double, d, d)
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
EIGEN_BLAS_TRMV_RM(float, float, f, s)
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRMV_RM(double, double, d, d,)
EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
EIGEN_BLAS_TRMV_RM(float, float, f, s,)
EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
#else
EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
#endif
} // end namespace internal

View File

@ -38,7 +38,7 @@ namespace Eigen {
namespace internal {
// implements LeftSide op(triangular)^-1 * general
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
{ \
@ -80,18 +80,24 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
} \
};
EIGEN_BLAS_TRSM_L(double, double, d)
EIGEN_BLAS_TRSM_L(dcomplex, double, z)
EIGEN_BLAS_TRSM_L(float, float, s)
EIGEN_BLAS_TRSM_L(scomplex, float, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRSM_L(double, double, dtrsm)
EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
EIGEN_BLAS_TRSM_L(float, float, strsm)
EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
#else
EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
EIGEN_BLAS_TRSM_L(float, float, strsm_)
EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
#endif
// implements RightSide general * op(triangular)^-1
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
{ \
@ -133,16 +139,22 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
} \
if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
/*std::cout << "TRMS_L specialization!\n";*/ \
} \
};
EIGEN_BLAS_TRSM_R(double, double, d)
EIGEN_BLAS_TRSM_R(dcomplex, double, z)
EIGEN_BLAS_TRSM_R(float, float, s)
EIGEN_BLAS_TRSM_R(scomplex, float, c)
#ifdef EIGEN_USE_MKL
EIGEN_BLAS_TRSM_R(double, double, dtrsm)
EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
EIGEN_BLAS_TRSM_R(float, float, strsm)
EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
#else
EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
EIGEN_BLAS_TRSM_R(float, float, strsm_)
EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
#endif
} // end namespace internal

View File

@ -55,6 +55,7 @@
#endif
#if defined __NVCC__
#pragma diag_suppress boolean_controlling_expr_is_constant
// Disable the "statement is unreachable" message
#pragma diag_suppress code_is_unreachable
// Disable the "dynamic initialization in unreachable code" message

View File

@ -49,10 +49,11 @@
#define EIGEN_USE_LAPACKE
#endif
#if defined(EIGEN_USE_MKL_VML)
#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
#define EIGEN_USE_MKL
#endif
#if defined EIGEN_USE_MKL
# include <mkl.h>
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
@ -108,6 +109,10 @@
#endif
#endif
#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
#include "../../misc/blas.h"
#endif
namespace Eigen {
typedef std::complex<double> dcomplex;
@ -121,8 +126,5 @@ typedef int BlasIndex;
} // end namespace Eigen
#if defined(EIGEN_USE_BLAS)
#include "../../misc/blas.h"
#endif
#endif // EIGEN_MKL_SUPPORT_H

View File

@ -413,7 +413,7 @@
// Does the compiler support variadic templates?
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
&& (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
&& (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
// this prevents nvcc from crashing when compiling Eigen on Tegra X1
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
@ -427,9 +427,9 @@
// Does the compiler fully support const expressions? (as in c++14)
#ifndef EIGEN_HAS_CONSTEXPR
#if defined(__CUDACC__)
#if defined(EIGEN_CUDACC)
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
#define EIGEN_HAS_CONSTEXPR 1
#endif
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
@ -669,7 +669,7 @@ namespace Eigen {
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
* vectorized and non-vectorized code.
*/
#if (defined __CUDACC__)
#if (defined EIGEN_CUDACC)
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
@ -837,7 +837,8 @@ namespace Eigen {
// just an empty macro !
#define EIGEN_EMPTY
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
// for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@ -990,7 +991,7 @@ namespace Eigen {
# define EIGEN_TRY try
# define EIGEN_CATCH(X) catch (X)
#else
# ifdef __CUDA_ARCH__
# ifdef EIGEN_CUDA_ARCH
# define EIGEN_THROW_X(X) asm("trap;")
# define EIGEN_THROW asm("trap;")
# else

View File

@ -11,7 +11,7 @@
#ifndef EIGEN_META_H
#define EIGEN_META_H
#if defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDA_ARCH)
#include <cfloat>
#include <math_constants.h>
#endif
@ -169,7 +169,7 @@ template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T>
{ typedef T type; };
#if defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDA_ARCH)
#if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON
@ -523,13 +523,13 @@ template<typename T, typename U> struct scalar_product_traits
namespace numext {
#if defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDA_ARCH)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif
#if defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDA_ARCH)
using internal::device::numeric_limits;
#else
using std::numeric_limits;

View File

@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const Quaterni
if (n != Scalar(0))
{
m_angle = Scalar(2)*atan2(n, abs(q.w()));
if(q.w() < 0)
if(q.w() < Scalar(0))
n = -n;
m_axis = q.vec() / n;
}

View File

@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase<Derived, 3>
typedef typename internal::traits<Derived>::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename internal::traits<Derived>::Coefficients Coefficients;
typedef typename Coefficients::CoeffReturnType CoeffReturnType;
typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
Scalar&, CoeffReturnType>::type NonConstCoeffReturnType;
enum {
Flags = Eigen::internal::traits<Derived>::Flags
};
@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase<Derived, 3>
/** \returns the \c x coefficient */
EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); }
EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
/** \returns the \c y coefficient */
EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); }
EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
/** \returns the \c z coefficient */
EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); }
EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
/** \returns the \c w coefficient */
EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); }
EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
/** \returns a reference to the \c x coefficient */
EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
/** \returns a reference to the \c y coefficient */
EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
/** \returns a reference to the \c z coefficient */
EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
/** \returns a reference to the \c w coefficient */
EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
/** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
/** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
/** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
/** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
/** \returns a read-only vector expression of the imaginary part (x,y,z) */
EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }

View File

@ -37,17 +37,20 @@ template<typename Scalar> class JacobiRotation
typedef typename NumTraits<Scalar>::Real RealScalar;
/** Default constructor without any initialization. */
EIGEN_DEVICE_FUNC
JacobiRotation() {}
/** Construct a planar rotation from a cosine-sine pair (\a c, \a s). */
EIGEN_DEVICE_FUNC
JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
Scalar& c() { return m_c; }
Scalar c() const { return m_c; }
Scalar& s() { return m_s; }
Scalar s() const { return m_s; }
EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
/** Concatenates two planar rotations */
EIGEN_DEVICE_FUNC
JacobiRotation operator*(const JacobiRotation& other)
{
using numext::conj;
@ -56,19 +59,26 @@ template<typename Scalar> class JacobiRotation
}
/** Returns the transposed transformation */
EIGEN_DEVICE_FUNC
JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
/** Returns the adjoint transformation */
EIGEN_DEVICE_FUNC
JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
template<typename Derived>
EIGEN_DEVICE_FUNC
bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
EIGEN_DEVICE_FUNC
bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
EIGEN_DEVICE_FUNC
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
protected:
EIGEN_DEVICE_FUNC
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
EIGEN_DEVICE_FUNC
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
Scalar m_c, m_s;
@ -264,6 +274,7 @@ namespace internal {
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
*/
template<typename VectorX, typename VectorY, typename OtherScalar>
EIGEN_DEVICE_FUNC
void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
}

View File

@ -1211,7 +1211,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
#endif
}//end deflation
#ifndef __CUDACC__
#ifndef EIGEN_CUDACC
/** \svd_module
*
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm

View File

@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
// eval without temporary
dst.resize(src.rows(), src.cols());
dst.setZero();
dst.reserve((std::max)(src.rows(),src.cols())*2);
dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
for (Index j=0; j<outerEvaluationSize; ++j)
{
dst.startVec(j);
@ -107,7 +107,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
DstXprType temp(src.rows(), src.cols());
temp.reserve((std::max)(src.rows(),src.cols())*2);
temp.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
for (Index j=0; j<outerEvaluationSize; ++j)
{
temp.startVec(j);

View File

@ -327,7 +327,7 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
m_isEtreeOk = true;
m_R.resize(m, n);
m_R.resize(diagSize, n);
m_Q.resize(m, diagSize);
// Allocate space for nonzero elements : rough estimation

View File

@ -120,6 +120,8 @@ run time. However, these assertions do cost time and can thus be turned off.
- \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
- \b \c EIGEN_NO_CUDA - disables CUDA support when defined. This can be useful in .cu files that use %Eigen only on the host side
and never call it from device code (see the sketch after this list).
- \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
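For instance, a minimal sketch of a .cu file that uses %Eigen on the host side only (the function name is illustrative):
\code
// Hypothetical host-only .cu translation unit: the define must precede any Eigen header.
#define EIGEN_NO_CUDA
#include <Eigen/Dense>

void scale_on_host(Eigen::MatrixXf &m) { m *= 2.0f; } // plain host code, never called from a kernel
\endcode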

View File

@ -261,6 +261,8 @@ x.setIdentity();
Vector3f::UnitX() // 1 0 0
Vector3f::UnitY() // 0 1 0
Vector3f::UnitZ() // 0 0 1
Vector4f::Unit(i)
x.setUnit(i);
\endcode
</td>
<td>
@ -278,6 +280,7 @@ N/A
VectorXf::Unit(size,i)
x.setUnit(size,i);
VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
== Vector4f::UnitY()
\endcode
@ -285,7 +288,12 @@ VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
</tr>
</table>
Note that it is allowed to call any of the \c set* functions on a dynamic-sized vector or matrix without passing new sizes.
For instance:
\code
MatrixXi M(3,3);
M.setIdentity();
\endcode
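The \c setUnit variants behave the same way; a small sketch (the results in the comments follow from the definitions above):
\code
VectorXf v(4);
v.setUnit(1);   // v = (0,1,0,0), size unchanged
v.setUnit(6,2); // resizes v to 6, then v = (0,0,1,0,0,0)
\endcode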
\subsection QuickRef_Map Mapping external arrays

View File

@ -63,6 +63,8 @@ In addition you can choose which parts will be substituted by defining one or mu
<tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
</table>
The options can be combined with \b MKL_DIRECT_CALL to enable MKL's direct call feature. This may help to increase the performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices. To make it work properly, the macro \c EIGEN_USE_MKL must also be defined if none of the other \c EIGEN_USE_MKL_* macros is defined.
Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
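As an illustration, a minimal sketch of a translation unit enabling all MKL backends together with the direct call feature (assuming MKL's headers and libraries are provided by the build system):
\code
#define MKL_DIRECT_CALL
#define EIGEN_USE_MKL_ALL // implies EIGEN_USE_BLAS, EIGEN_USE_LAPACKE, and EIGEN_USE_MKL_VML
#include <Eigen/Dense>
\endcode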

View File

@ -3,18 +3,16 @@ namespace Eigen {
/** \page TopicCUDA Using Eigen in CUDA kernels
\b Disclaimer: this page is about an \b experimental feature in %Eigen.
Starting from CUDA 5.0, the CUDA compiler, \c nvcc, is able to properly parse %Eigen's code (almost).
A few adaptations of %Eigen's code already allow using some parts of %Eigen in your own CUDA kernels.
To this end you need the devel branch of %Eigen, CUDA 5.0 or greater with GCC.
Starting from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays of fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc, most of Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords, making them callable from both host and device code.
This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen header.
This might be useful to disable some warnings when a .cu file makes use of Eigen on the host side only.
However, in both cases, the host's SIMD vectorization has to be disabled in .cu files.
It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files.
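As a hedged sketch of what such a kernel may look like (the kernel and variable names are illustrative, not part of %Eigen):
\code
#include <Eigen/Dense>

// Each thread normalizes one fixed-size vector; Vector3f's methods are
// __device__ __host__ when compiled by nvcc, so this compiles as device code.
__global__ void normalize_all(Eigen::Vector3f* v, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) v[i].normalize();
}
\endcode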
Known issues:
- \c nvcc with MS Visual Studio does not work (patch welcome)
- \c nvcc with \c clang does not work (patch welcome)
- \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file. To workaround this, you can add the following before including any other files:
\code
// workaround issue between gcc >= 4.7 and cuda 5.5

View File

@ -0,0 +1,7 @@
Matrix4i A;
A << 1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 14, 15, 16;
std::cout << Matrix2i::Map(&A(1,1),Stride<8,2>()) << std::endl;
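// Expected output (assuming the default column-major storage of Matrix4i,
// so &A(1,1) points at the value 6, and Stride<8,2> means outer stride 8,
// inner stride 2):
//  6  8
// 14 16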

View File

@ -104,7 +104,8 @@ void test_bdcsvd()
CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
// Check that preallocation avoids subsequent mallocs
CALL_SUBTEST_9( svd_preallocate<void>() );
// Disabled because not supported by BDCSVD
// CALL_SUBTEST_9( svd_preallocate<void>() );
CALL_SUBTEST_2( svd_underoverflow<void>() );
}

View File

@ -20,7 +20,7 @@
#include <math_constants.h>
#include <cuda.h>
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -231,6 +231,19 @@ template<typename Scalar> void mapQuaternion(void){
VERIFY_IS_APPROX(mq3*mq2, q3*q2);
VERIFY_IS_APPROX(mcq1*mq2, q1*q2);
VERIFY_IS_APPROX(mcq3*mq2, q3*q2);
// Bug 1461, compilation issue with Map<const Quat>::w(), and other reference/constness checks:
VERIFY_IS_APPROX(mcq3.coeffs().x() + mcq3.coeffs().y() + mcq3.coeffs().z() + mcq3.coeffs().w(), mcq3.coeffs().sum());
VERIFY_IS_APPROX(mcq3.x() + mcq3.y() + mcq3.z() + mcq3.w(), mcq3.coeffs().sum());
mq3.w() = 1;
const Quaternionx& cq3(q3);
VERIFY( &cq3.x() == &q3.x() );
const MQuaternionUA& cmq3(mq3);
VERIFY( &cmq3.x() == &mq3.x() );
// FIXME the following should be ok. The problem is that currently the LvalueBit flag
// is used to determine whether we can return a coeff by reference or not, which is not enough for Map<const ...>.
//const MCQuaternionUA& cmcq3(mcq3);
//VERIFY( &cmcq3.x() == &mcq3.x() );
}
template<typename Scalar> void quaternionAlignment(void){

View File

@ -58,7 +58,7 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
MatrixType m = MatrixType::Random(rows,cols);
Scalar s1 = internal::random<Scalar>();
Index arraysize = 2*(rows+4)*(cols+4);
Index arraysize = 4*(rows+4)*(cols+4);
Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
Scalar* array1 = a_array1;
@ -143,9 +143,62 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
VERIFY_IS_APPROX(map,s1*m);
}
// test inner stride and no outer stride
for(int k=0; k<2; ++k)
{
if(k==1 && (m.innerSize()*2)*m.outerSize() > maxsize2)
break;
Scalar* array = (k==0 ? array1 : array2);
Map<MatrixType, Alignment, InnerStride<Dynamic> > map(array, rows, cols, InnerStride<Dynamic>(2));
map = m;
VERIFY(map.outerStride() == map.innerSize()*2);
for(int i = 0; i < m.outerSize(); ++i)
for(int j = 0; j < m.innerSize(); ++j)
{
VERIFY(array[map.innerSize()*i*2+j*2] == m.coeffByOuterInner(i,j));
VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
}
VERIFY_IS_APPROX(s1*map,s1*m);
map *= s1;
VERIFY_IS_APPROX(map,s1*m);
}
internal::aligned_delete(a_array1, arraysize+1);
}
// Additional tests for inner-stride but no outer-stride
template<int>
void bug1453()
{
const int data[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
typedef Matrix<int,Dynamic,Dynamic,RowMajor> RowMatrixXi;
typedef Matrix<int,2,3,ColMajor> ColMatrix23i;
typedef Matrix<int,3,2,ColMajor> ColMatrix32i;
typedef Matrix<int,2,3,RowMajor> RowMatrix23i;
typedef Matrix<int,3,2,RowMajor> RowMatrix32i;
VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
}
void test_mapstride()
{
for(int i = 0; i < g_repeat; i++) {
@ -175,6 +228,8 @@ void test_mapstride()
CALL_SUBTEST_5( map_class_matrix<Unaligned>(MatrixXi(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
CALL_SUBTEST_6( map_class_matrix<Aligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
CALL_SUBTEST_6( map_class_matrix<Unaligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
CALL_SUBTEST_5( bug1453<0>() );
TEST_SET_BUT_UNUSED_VARIABLE(maxn);
}

View File

@ -15,6 +15,10 @@ bool check_is_convertible(const From&, const To&)
return internal::is_convertible<From,To>::value;
}
struct FooReturnType {
typedef int ReturnType;
};
void test_meta()
{
VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
@ -75,6 +79,11 @@ void test_meta()
VERIFY((!check_is_convertible(A*B, f) ));
VERIFY(( check_is_convertible(A*B, A) ));
}
VERIFY(( internal::has_ReturnType<FooReturnType>::value ));
VERIFY(( internal::has_ReturnType<ScalarBinaryOpTraits<int,int> >::value ));
VERIFY(( !internal::has_ReturnType<MatrixXf>::value ));
VERIFY(( !internal::has_ReturnType<int>::value ));
VERIFY(internal::meta_sqrt<1>::ret == 1);
#define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))

View File

@ -191,6 +191,24 @@ void testVectorType(const VectorType& base)
}
}
}
// test setUnit()
if(m.size()>0)
{
for(Index k=0; k<10; ++k)
{
Index i = internal::random<Index>(0,m.size()-1);
m.setUnit(i);
VERIFY_IS_APPROX( m, VectorType::Unit(m.size(), i) );
}
if(VectorType::SizeAtCompileTime==Dynamic)
{
Index i = internal::random<Index>(0,2*m.size()-1);
m.setUnit(2*m.size(),i);
VERIFY_IS_APPROX( m, VectorType::Unit(m.size(),i) );
}
}
}
template<typename MatrixType>

View File

@ -9,6 +9,8 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define TEST_ENABLE_TEMPORARY_TRACKING
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
// ^^ see bug 1449
#include "main.h"

View File

@ -12,7 +12,7 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
namespace Eigen {
@ -1382,5 +1382,5 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
} // end namespace Eigen
#endif // EIGEN_USE_GPU and __CUDACC__
#endif // EIGEN_USE_GPU and EIGEN_CUDACC
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H

View File

@ -553,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int StaticKernelSize>
struct GetKernelSize {

View File

@ -211,7 +211,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
#ifndef EIGEN_CUDA_ARCH
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
@ -239,7 +239,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
#ifndef EIGEN_CUDA_ARCH
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
@ -265,7 +265,7 @@ struct GpuDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH)
cudaError_t err = cudaStreamSynchronize(stream_->stream());
if (err != cudaSuccess) {
std::cerr << "Error detected in CUDA stream: "
@ -304,7 +304,7 @@ struct GpuDevice {
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
cudaError_t error = cudaStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
@ -323,9 +323,9 @@ struct GpuDevice {
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
#ifdef EIGEN_CUDACC
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
#ifndef EIGEN_CUDA_ARCH
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess);

View File

@ -35,7 +35,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
#ifndef __CUDA_ARCH__
#ifndef EIGEN_CUDA_ARCH
// Running on the host CPU
return 1;
#else
@ -45,7 +45,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
// Running on the host CPU
return l1CacheSize();
#else
@ -55,7 +55,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
// Running single threaded on the host CPU
return l3CacheSize();
#else
@ -65,13 +65,13 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
#ifndef EIGEN_CUDA_ARCH
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
#else
// Running on a CUDA device
return __CUDA_ARCH__ / 100;
return EIGEN_CUDA_ARCH / 100;
#endif
}
};

View File

@ -131,7 +131,7 @@ T loadConstant(const T* address) {
return *address;
}
// Use the texture cache on CUDA devices whenever possible
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float loadConstant(const float* address) {
return __ldg(address);

View File

@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
};
#if defined(__CUDACC__)
#if defined(EIGEN_CUDACC)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
@ -264,7 +264,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
evaluator.cleanup();
}
#endif // __CUDACC__
#endif // EIGEN_CUDACC
#endif // EIGEN_USE_GPU
// SYCL Executor policy

View File

@ -35,7 +35,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
return __clz(val);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val);
@ -53,7 +53,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
{
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
return __clzll(val);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val);
@ -90,7 +90,7 @@ namespace {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
#if defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDA_ARCH)
return __umulhi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
@ -101,7 +101,7 @@ namespace {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(__CUDA_ARCH__)
#if defined(EIGEN_CUDA_ARCH)
return __umul64hi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
@ -124,7 +124,7 @@ namespace {
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
return (__umulhi(magic, n) >> shift);
#elif defined(__SYCL_DEVICE_ONLY__)
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);

View File

@ -27,7 +27,7 @@
*/
// SFINAE requires variadic templates
#ifndef __CUDACC__
#ifndef EIGEN_CUDACC
#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC

View File

@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
template <>
struct PacketType<half, GpuDevice> {
typedef half2 type;

View File

@ -16,7 +16,7 @@ namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#ifdef __CUDA_ARCH__
#ifdef EIGEN_CUDA_ARCH
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
assert(threadIdx.z == 0);

View File

@ -334,7 +334,7 @@ struct OuterReducer {
};
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
@ -694,7 +694,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
@ -781,7 +781,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
Op m_reducer;
// For full reductions
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)

View File

@ -14,7 +14,7 @@ namespace Eigen {
namespace internal {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
// Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple cuda thread to safely accumulate at the same
@ -23,7 +23,7 @@ namespace internal {
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
#if EIGEN_CUDA_ARCH >= 300
if (sizeof(T) == 4)
{
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@ -102,7 +102,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
#if EIGEN_CUDA_ARCH >= 300
atomicAdd(output, accum);
#else // __CUDA_ARCH__ >= 300
assert(0 && "Shouldn't be called on unsupported device");
@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if __CUDA_ARCH__ >= 300
#if EIGEN_CUDA_ARCH >= 300
// Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) {
@ -372,7 +372,7 @@ template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
#if EIGEN_CUDA_ARCH >= 300
typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);

View File

@ -242,7 +242,7 @@ struct ScanLauncher {
}
};
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
@ -281,7 +281,7 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
}
};
#endif // EIGEN_USE_GPU && __CUDACC__
#endif // EIGEN_USE_GPU && EIGEN_CUDACC
} // end namespace Eigen

View File

@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen {
template <typename T, size_t n> class array {

View File

@ -121,7 +121,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgammaf_r(x, &dummy);
#else
@ -134,7 +134,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgamma_r(x, &dummy);
#else

View File

@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plgamma<float4>(const float4& a)

View File

@ -12,7 +12,7 @@
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -13,7 +13,7 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -11,7 +11,7 @@
#define EIGEN_TEST_FUNC cxx11_tensor_complex
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -11,7 +11,7 @@
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -14,7 +14,7 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -12,7 +12,7 @@
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -13,7 +13,7 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -13,7 +13,7 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -13,7 +13,7 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -12,7 +12,7 @@
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"

View File

@ -13,7 +13,7 @@
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#if EIGEN_CUDACC_VER >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"