mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
Merged eigen/eigen into default
This commit is contained in:
commit
1affe3d8df
@ -9,6 +9,7 @@
|
||||
#define EIGEN_CHOLESKY_MODULE_H
|
||||
|
||||
#include "Core"
|
||||
#include "Jacobi"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
@ -31,7 +32,11 @@
|
||||
#include "src/Cholesky/LLT.h"
|
||||
#include "src/Cholesky/LDLT.h"
|
||||
#ifdef EIGEN_USE_LAPACKE
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/Cholesky/LLT_LAPACKE.h"
|
||||
#endif
|
||||
|
||||
|
32
Eigen/Core
32
Eigen/Core
@ -14,8 +14,24 @@
|
||||
// first thing Eigen does: stop the compiler from committing suicide
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
|
||||
#define EIGEN_CUDACC __CUDACC__
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
|
||||
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
|
||||
#endif
|
||||
|
||||
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
|
||||
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
|
||||
#elif defined(__CUDACC_VER__)
|
||||
#define EIGEN_CUDACC_VER __CUDACC_VER__
|
||||
#else
|
||||
#define EIGEN_CUDACC_VER 0
|
||||
#endif
|
||||
|
||||
// Handle NVCC/CUDA/SYCL
|
||||
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
|
||||
#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
|
||||
// Do not try asserts on CUDA and SYCL!
|
||||
#ifndef EIGEN_NO_DEBUG
|
||||
#define EIGEN_NO_DEBUG
|
||||
@ -30,7 +46,7 @@
|
||||
#endif
|
||||
|
||||
// All functions callable from CUDA code must be qualified with __device__
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
// Do not try to vectorize on CUDA and SYCL!
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
@ -47,16 +63,20 @@
|
||||
#define EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
|
||||
#ifdef __NVCC__
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
#endif
|
||||
|
||||
// When compiling CUDA device code with NVCC, pull in math functions from the
|
||||
// global namespace. In host mode, and when device doee with clang, use the
|
||||
// std versions.
|
||||
#if defined(__CUDA_ARCH__) && defined(__NVCC__)
|
||||
#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
|
||||
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
|
||||
#else
|
||||
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
|
||||
#endif
|
||||
|
||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
|
||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
|
||||
#define EIGEN_EXCEPTIONS
|
||||
#endif
|
||||
|
||||
@ -233,10 +253,10 @@
|
||||
#define EIGEN_HAS_FP16_C
|
||||
#endif
|
||||
|
||||
#if defined __CUDACC__
|
||||
#if defined EIGEN_CUDACC
|
||||
#define EIGEN_VECTORIZE_CUDA
|
||||
#include <vector_types.h>
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
#endif
|
||||
#endif
|
||||
|
@ -45,7 +45,11 @@
|
||||
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
|
||||
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
|
||||
#ifdef EIGEN_USE_LAPACKE
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
|
||||
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
|
||||
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
|
||||
|
4
Eigen/LU
4
Eigen/LU
@ -28,7 +28,11 @@
|
||||
#include "src/LU/FullPivLU.h"
|
||||
#include "src/LU/PartialPivLU.h"
|
||||
#ifdef EIGEN_USE_LAPACKE
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/LU/PartialPivLU_LAPACKE.h"
|
||||
#endif
|
||||
#include "src/LU/Determinant.h"
|
||||
|
4
Eigen/QR
4
Eigen/QR
@ -36,7 +36,11 @@
|
||||
#include "src/QR/ColPivHouseholderQR.h"
|
||||
#include "src/QR/CompleteOrthogonalDecomposition.h"
|
||||
#ifdef EIGEN_USE_LAPACKE
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/QR/HouseholderQR_LAPACKE.h"
|
||||
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
|
||||
#endif
|
||||
|
@ -37,7 +37,11 @@
|
||||
#include "src/SVD/JacobiSVD.h"
|
||||
#include "src/SVD/BDCSVD.h"
|
||||
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/SVD/JacobiSVD_LAPACKE.h"
|
||||
#endif
|
||||
|
||||
|
@ -248,7 +248,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
|
||||
/** \brief Reports whether previous computation was successful.
|
||||
*
|
||||
* \returns \c Success if computation was succesful,
|
||||
* \c NumericalIssue if the matrix.appears to be negative.
|
||||
* \c NumericalIssue if the factorization failed because of a zero pivot.
|
||||
*/
|
||||
ComputationInfo info() const
|
||||
{
|
||||
|
@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
|
||||
*
|
||||
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
|
||||
* \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
|
||||
* The other triangular part won't be read.
|
||||
* The other triangular part won't be read.
|
||||
*
|
||||
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
|
||||
* matrix A such that A = LL^* = U^*U, where L is lower triangular.
|
||||
@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
|
||||
* Example: \include LLT_example.cpp
|
||||
* Output: \verbinclude LLT_example.out
|
||||
*
|
||||
* \b Performance: for best performance, it is recommended to use a column-major storage format
|
||||
* with the Lower triangular part (the default), or, equivalently, a row-major storage format
|
||||
* with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
|
||||
* step, and rank-updates can be up to 3 times slower.
|
||||
*
|
||||
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
|
||||
*
|
||||
* Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
|
||||
* Therefore, the strict lower part does not have to store correct values.
|
||||
*
|
||||
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
|
||||
*/
|
||||
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
|
||||
* Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
|
||||
* the strict lower part does not have to store correct values.
|
||||
*/
|
||||
template<typename _MatrixType, int _UpLo> class LLT
|
||||
{
|
||||
public:
|
||||
@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
void solveInPlace(MatrixBase<Derived> &bAndX) const;
|
||||
void solveInPlace(const MatrixBase<Derived> &bAndX) const;
|
||||
|
||||
template<typename InputType>
|
||||
LLT& compute(const EigenBase<InputType>& matrix);
|
||||
@ -177,7 +181,7 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
/** \brief Reports whether previous computation was successful.
|
||||
*
|
||||
* \returns \c Success if computation was succesful,
|
||||
* \c NumericalIssue if the matrix.appears to be negative.
|
||||
* \c NumericalIssue if the matrix.appears not to be positive definite.
|
||||
*/
|
||||
ComputationInfo info() const
|
||||
{
|
||||
@ -424,7 +428,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
|
||||
eigen_assert(a.rows()==a.cols());
|
||||
const Index size = a.rows();
|
||||
m_matrix.resize(size, size);
|
||||
m_matrix = a.derived();
|
||||
if (!internal::is_same_dense(m_matrix, a.derived()))
|
||||
m_matrix = a.derived();
|
||||
|
||||
// Compute matrix L1 norm = max abs column sum.
|
||||
m_l1_norm = RealScalar(0);
|
||||
@ -484,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
*
|
||||
* This version avoids a copy when the right hand side matrix b is not needed anymore.
|
||||
*
|
||||
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
|
||||
* This function will const_cast it, so constness isn't honored here.
|
||||
*
|
||||
* \sa LLT::solve(), MatrixBase::llt()
|
||||
*/
|
||||
template<typename MatrixType, int _UpLo>
|
||||
template<typename Derived>
|
||||
void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
|
||||
void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
|
||||
{
|
||||
eigen_assert(m_isInitialized && "LLT is not initialized.");
|
||||
eigen_assert(m_matrix.rows()==bAndX.rows());
|
||||
|
@ -861,6 +861,42 @@ template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
|
||||
{ return Derived::Unit(3); }
|
||||
|
||||
/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
|
||||
*
|
||||
* \param i index of the unique coefficient to be set to 1
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
eigen_assert(i<size());
|
||||
derived().setZero();
|
||||
derived().coeffRef(i) = Scalar(1);
|
||||
return derived();
|
||||
}
|
||||
|
||||
/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
|
||||
*
|
||||
* \param newSize the new size of the vector
|
||||
* \param i index of the unique coefficient to be set to 1
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
eigen_assert(i<newSize);
|
||||
derived().resize(newSize);
|
||||
return setUnit(i);
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CWISE_NULLARY_OP_H
|
||||
|
@ -18,18 +18,33 @@ enum {
|
||||
Small = 3
|
||||
};
|
||||
|
||||
// Define the threshold value to fallback from the generic matrix-matrix product
|
||||
// implementation (heavy) to the lightweight coeff-based product one.
|
||||
// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
// in products/GeneralMatrixMatrix.h for more details.
|
||||
// TODO This threshold should also be used in the compile-time selector below.
|
||||
#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
|
||||
// This default value has been obtained on a Haswell architecture.
|
||||
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
|
||||
#endif
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<int Rows, int Cols, int Depth> struct product_type_selector;
|
||||
|
||||
template<int Size, int MaxSize> struct product_size_category
|
||||
{
|
||||
enum { is_large = MaxSize == Dynamic ||
|
||||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
||||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
||||
value = is_large ? Large
|
||||
: Size == 1 ? 1
|
||||
: Small
|
||||
enum {
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
is_large = MaxSize == Dynamic ||
|
||||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
||||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
||||
#else
|
||||
is_large = 0,
|
||||
#endif
|
||||
value = is_large ? Large
|
||||
: Size == 1 ? 1
|
||||
: Small
|
||||
};
|
||||
};
|
||||
|
||||
@ -379,8 +394,6 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
|
||||
*
|
||||
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
|
||||
*/
|
||||
#ifndef __CUDACC__
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
inline const Product<Derived, OtherDerived>
|
||||
@ -412,8 +425,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
return Product<Derived, OtherDerived>(derived(), other.derived());
|
||||
}
|
||||
|
||||
#endif // __CUDACC__
|
||||
|
||||
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
|
||||
*
|
||||
* The returned product will behave like any other expressions: the coefficients of the product will be
|
||||
|
@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
|
||||
/** \internal tries to do cache prefetching of \a addr */
|
||||
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(__LP64__)
|
||||
// 64-bit pointer operand constraint for inlined asm
|
||||
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
||||
@ -526,7 +526,7 @@ inline void palign(PacketType& first, const PacketType& second)
|
||||
***************************************************************************/
|
||||
|
||||
// Eigen+CUDA does not support complexes.
|
||||
#ifndef __CUDACC__
|
||||
#ifndef EIGEN_CUDACC
|
||||
|
||||
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
||||
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||
|
@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
|
||||
{
|
||||
typedef traits<PlainObjectType> TraitsBase;
|
||||
enum {
|
||||
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
|
||||
? PlainObjectType::ColsAtCompileTime
|
||||
: PlainObjectType::RowsAtCompileTime,
|
||||
|
||||
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
|
||||
? int(PlainObjectType::InnerStrideAtCompileTime)
|
||||
: int(StrideType::InnerStrideAtCompileTime),
|
||||
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
|
||||
? int(PlainObjectType::OuterStrideAtCompileTime)
|
||||
? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
|
||||
? Dynamic
|
||||
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
|
||||
: int(StrideType::OuterStrideAtCompileTime),
|
||||
Alignment = int(MapOptions)&int(AlignedMask),
|
||||
Flags0 = TraitsBase::Flags & (~NestByRefBit),
|
||||
@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
|
||||
inline Index outerStride() const
|
||||
{
|
||||
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
|
||||
: IsVectorAtCompileTime ? this->size()
|
||||
: int(Flags)&RowMajorBit ? this->cols()
|
||||
: this->rows();
|
||||
: internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? internal::traits<Map>::OuterStrideAtCompileTime
|
||||
: IsVectorAtCompileTime ? (this->size() * innerStride())
|
||||
: int(Flags)&RowMajorBit ? (this->cols() * innerStride())
|
||||
: (this->rows() * innerStride());
|
||||
}
|
||||
|
||||
/** Constructor in the fixed-size case.
|
||||
|
@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
template<typename T>
|
||||
struct real_impl<std::complex<T> >
|
||||
{
|
||||
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
template<typename T>
|
||||
struct imag_impl<std::complex<T> >
|
||||
{
|
||||
@ -778,7 +778,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isfinite_impl(const T& x)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
return (::isfinite)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isfinite;
|
||||
@ -793,7 +793,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isinf_impl(const T& x)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
return (::isinf)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isinf;
|
||||
@ -808,7 +808,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isnan_impl(const T& x)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
return (::isnan)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isnan;
|
||||
@ -874,7 +874,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
|
||||
|
||||
namespace numext {
|
||||
|
||||
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
||||
@ -1088,7 +1088,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
|
||||
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log1p(const float &x) { return ::log1pf(x); }
|
||||
|
||||
@ -1146,7 +1146,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
|
||||
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float floor(const float &x) { return ::floorf(x); }
|
||||
|
||||
@ -1167,7 +1167,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
|
||||
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float ceil(const float &x) { return ::ceilf(x); }
|
||||
|
||||
@ -1225,7 +1225,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log(const float &x) { return ::logf(x); }
|
||||
|
||||
@ -1253,7 +1253,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
|
||||
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float abs(const float &x) { return ::fabsf(x); }
|
||||
|
||||
@ -1283,7 +1283,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
|
||||
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float exp(const float &x) { return ::expf(x); }
|
||||
|
||||
@ -1303,7 +1303,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
|
||||
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float expm1(const float &x) { return ::expm1f(x); }
|
||||
|
||||
@ -1323,7 +1323,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
|
||||
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cos(const float &x) { return ::cosf(x); }
|
||||
|
||||
@ -1343,7 +1343,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
|
||||
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sin(const float &x) { return ::sinf(x); }
|
||||
|
||||
@ -1363,7 +1363,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
|
||||
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tan(const float &x) { return ::tanf(x); }
|
||||
|
||||
@ -1378,13 +1378,14 @@ T acos(const T &x) {
|
||||
return acos(x);
|
||||
}
|
||||
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T acosh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(acosh);
|
||||
return acosh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); }
|
||||
@ -1393,7 +1394,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
|
||||
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float acos(const float &x) { return ::acosf(x); }
|
||||
|
||||
@ -1408,12 +1409,14 @@ T asin(const T &x) {
|
||||
return asin(x);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T asinh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(asinh);
|
||||
return asinh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); }
|
||||
@ -1422,7 +1425,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
|
||||
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float asin(const float &x) { return ::asinf(x); }
|
||||
|
||||
@ -1437,12 +1440,14 @@ T atan(const T &x) {
|
||||
return atan(x);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T atanh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(atanh);
|
||||
return atanh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); }
|
||||
@ -1451,7 +1456,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
|
||||
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float atan(const float &x) { return ::atanf(x); }
|
||||
|
||||
@ -1472,7 +1477,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
|
||||
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cosh(const float &x) { return ::coshf(x); }
|
||||
|
||||
@ -1492,7 +1497,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
|
||||
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sinh(const float &x) { return ::sinhf(x); }
|
||||
|
||||
@ -1510,12 +1515,12 @@ T tanh(const T &x) {
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
|
||||
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
|
||||
#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
|
||||
#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(const float &x) { return ::tanhf(x); }
|
||||
|
||||
@ -1535,7 +1540,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
|
||||
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float fmod(const float& a, const float& b) {
|
||||
|
@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator-=(const MatrixBase<OtherDerived>& other);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Product<Derived,OtherDerived,LazyProduct>
|
||||
operator*(const MatrixBase<OtherDerived> &other) const
|
||||
{ return this->lazyProduct(other); }
|
||||
#else
|
||||
|
||||
template<typename OtherDerived>
|
||||
const Product<Derived,OtherDerived>
|
||||
operator*(const MatrixBase<OtherDerived> &other) const;
|
||||
|
||||
#endif
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Product<Derived,OtherDerived,LazyProduct>
|
||||
@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
|
||||
Derived& setIdentity();
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& setIdentity(Index rows, Index cols);
|
||||
EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
|
||||
EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
|
||||
|
||||
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
@ -305,7 +298,7 @@ template<typename Derived> class MatrixBase
|
||||
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
|
||||
{ return cwiseNotEqual(other).any(); }
|
||||
|
||||
NoAlias<Derived,Eigen::MatrixBase > noalias();
|
||||
NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
|
||||
|
||||
// TODO forceAlignedAccess is temporarily disabled
|
||||
// Need to find a nicer workaround.
|
||||
@ -437,8 +430,10 @@ template<typename Derived> class MatrixBase
|
||||
///////// Jacobi module /////////
|
||||
|
||||
template<typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
||||
template<typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
||||
|
||||
///////// SparseCore module /////////
|
||||
|
@ -33,6 +33,7 @@ class NoAlias
|
||||
public:
|
||||
typedef typename ExpressionType::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
|
||||
|
||||
template<typename OtherDerived>
|
||||
|
@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
|
||||
* \a data pointers.
|
||||
*
|
||||
* Here is an example using strides:
|
||||
* \include Matrix_Map_stride.cpp
|
||||
* Output: \verbinclude Matrix_Map_stride.out
|
||||
*
|
||||
* \see class Map
|
||||
*/
|
||||
//@{
|
||||
|
@ -851,7 +851,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
||||
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
||||
}
|
||||
|
||||
#ifndef __CUDACC__
|
||||
#ifndef EIGEN_CUDACC
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
{
|
||||
@ -895,7 +895,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
||||
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
||||
}
|
||||
|
||||
#ifndef __CUDACC__
|
||||
#ifndef EIGEN_CUDACC
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
{
|
||||
|
@ -16,7 +16,7 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
|
||||
// Many std::complex methods such as operator+, operator-, operator* and
|
||||
// operator/ are not constexpr. Due to this, clang does not treat them as device
|
||||
|
@ -140,7 +140,7 @@ struct half : public half_impl::half_base {
|
||||
|
||||
namespace half_impl {
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
|
||||
// Intrinsics for native fp16 support. Note that on current hardware,
|
||||
// these are no faster than fp32 arithmetic (you need to use the half2
|
||||
@ -281,7 +281,7 @@ union FP32 {
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
return __float2half(ff);
|
||||
|
||||
#elif defined(EIGEN_HAS_FP16_C)
|
||||
@ -336,7 +336,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
return __half2float(h);
|
||||
|
||||
#elif defined(EIGEN_HAS_FP16_C)
|
||||
@ -370,7 +370,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
|
||||
return (a.x & 0x7fff) == 0x7c00;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return __hisnan(a);
|
||||
#else
|
||||
return (a.x & 0x7fff) > 0x7c00;
|
||||
@ -386,7 +386,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
|
||||
return result;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
return half(hexp(a));
|
||||
#else
|
||||
return half(::expf(float(a)));
|
||||
@ -396,7 +396,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
|
||||
return half(numext::expm1(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return half(::hlog(a));
|
||||
#else
|
||||
return half(::logf(float(a)));
|
||||
@ -409,7 +409,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
|
||||
return half(::log10f(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
return half(hsqrt(a));
|
||||
#else
|
||||
return half(::sqrtf(float(a)));
|
||||
@ -431,14 +431,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
|
||||
return half(::tanhf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
return half(hfloor(a));
|
||||
#else
|
||||
return half(::floorf(float(a)));
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
return half(hceil(a));
|
||||
#else
|
||||
return half(::ceilf(float(a)));
|
||||
@ -446,7 +446,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return __hlt(b, a) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
@ -455,7 +455,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return __hlt(a, b) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
@ -576,7 +576,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
|
||||
return Eigen::half(::expf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
return Eigen::half(::hlog(a));
|
||||
#else
|
||||
return Eigen::half(::logf(float(a)));
|
||||
@ -610,14 +610,14 @@ struct hash<Eigen::half> {
|
||||
|
||||
|
||||
// Add the missing shfl_xor intrinsic
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||
}
|
||||
#endif
|
||||
|
||||
// ldg() has an overload for __half, but we also need one for Eigen::half.
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||
return Eigen::half_impl::raw_uint16_to_half(
|
||||
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
||||
@ -625,7 +625,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
namespace Eigen {
|
||||
namespace numext {
|
||||
|
||||
|
@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 plog<float4>(const float4& a)
|
||||
{
|
||||
|
@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
||||
|
||||
@ -196,7 +196,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
return __ldg((const float4*)from);
|
||||
#else
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
@ -204,7 +204,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
return __ldg((const double2*)from);
|
||||
#else
|
||||
return make_double2(from[0], from[1]);
|
||||
@ -213,7 +213,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
|
||||
#else
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
@ -221,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
return make_double2(__ldg(from+0), __ldg(from+1));
|
||||
#else
|
||||
return make_double2(from[0], from[1]);
|
||||
|
@ -15,7 +15,7 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Most of the following operations require arch >= 3.0
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
|
||||
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
||||
|
||||
@ -69,7 +69,7 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half*
|
||||
|
||||
template<>
|
||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
#if EIGEN_CUDA_ARCH >= 350
|
||||
return __ldg((const half2*)from);
|
||||
#else
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
@ -78,7 +78,7 @@ template<>
|
||||
|
||||
template<>
|
||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
#if EIGEN_CUDA_ARCH >= 350
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
#else
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
@ -116,7 +116,7 @@ ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||
#else
|
||||
float f = __half2float(a) + 1.0f;
|
||||
@ -125,7 +125,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half&
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hadd2(a, b);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -139,7 +139,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hsub2(a, b);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -153,7 +153,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hneg2(a);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -165,7 +165,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hmul2(a, b);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -179,7 +179,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hfma2(a, b, c);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -225,7 +225,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hadd(__low2half(a), __high2half(a));
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -235,7 +235,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
return __hgt(first, second) ? first : second;
|
||||
@ -247,7 +247,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
return __hlt(first, second) ? first : second;
|
||||
@ -259,7 +259,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hmul(__low2half(a), __high2half(a));
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -284,7 +284,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
half2 plog<half2>(const half2& a) {
|
||||
|
@ -19,7 +19,7 @@ struct scalar_cast_op<float, Eigen::half> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef Eigen::half result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
return __float2half(a);
|
||||
#else
|
||||
return Eigen::half(a);
|
||||
@ -37,7 +37,7 @@ struct scalar_cast_op<int, Eigen::half> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef Eigen::half result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
return __float2half(static_cast<float>(a));
|
||||
#else
|
||||
return Eigen::half(static_cast<float>(a));
|
||||
@ -55,7 +55,7 @@ struct scalar_cast_op<Eigen::half, float> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef float result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
return __half2float(a);
|
||||
#else
|
||||
return static_cast<float>(a);
|
||||
@ -69,7 +69,7 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
|
||||
|
||||
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<Eigen::half, float> {
|
||||
|
@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
|
||||
{
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
// FIXME is there some kind of cuda::swap?
|
||||
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
|
||||
#else
|
||||
|
@ -427,7 +427,13 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
template<typename Dst>
|
||||
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
||||
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
|
||||
// to determine the following heuristic.
|
||||
// EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
|
||||
// unless it has been specialized by the user or for a given architecture.
|
||||
// Note that the condition rhs.rows()>0 was required because lazy produc is (was?) not happy with empty inputs.
|
||||
// I'm not sure it is still required.
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
|
||||
lazyproduct::evalTo(dst, lhs, rhs);
|
||||
else
|
||||
{
|
||||
@ -439,7 +445,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
template<typename Dst>
|
||||
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
|
||||
lazyproduct::addTo(dst, lhs, rhs);
|
||||
else
|
||||
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
|
||||
@ -448,7 +454,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
template<typename Dst>
|
||||
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
|
||||
if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
|
||||
lazyproduct::subTo(dst, lhs, rhs);
|
||||
else
|
||||
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
|
||||
|
@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
|
||||
BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
|
||||
char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
|
||||
EIGTYPE beta(1); \
|
||||
BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
|
||||
BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
|
||||
} \
|
||||
};
|
||||
|
||||
@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
|
||||
} \
|
||||
};
|
||||
|
||||
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
|
||||
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
|
||||
#else
|
||||
EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
|
||||
EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
|
||||
#endif
|
||||
|
||||
// TODO hanlde complex cases
|
||||
// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
|
||||
|
@ -46,7 +46,7 @@ namespace internal {
|
||||
|
||||
// gemm specialization
|
||||
|
||||
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
|
||||
#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
|
||||
template< \
|
||||
typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
@ -100,13 +100,20 @@ static void run(Index rows, Index cols, Index depth, \
|
||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||
} else b = _rhs; \
|
||||
\
|
||||
BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
}};
|
||||
|
||||
GEMM_SPECIALIZATION(double, d, double, d)
|
||||
GEMM_SPECIALIZATION(float, f, float, s)
|
||||
GEMM_SPECIALIZATION(dcomplex, cd, double, z)
|
||||
GEMM_SPECIALIZATION(scomplex, cf, float, c)
|
||||
#ifdef EIGEN_USE_MKL
|
||||
GEMM_SPECIALIZATION(double, d, double, dgemm)
|
||||
GEMM_SPECIALIZATION(float, f, float, sgemm)
|
||||
GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
|
||||
GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
|
||||
#else
|
||||
GEMM_SPECIALIZATION(double, d, double, dgemm_)
|
||||
GEMM_SPECIALIZATION(float, f, float, sgemm_)
|
||||
GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
|
||||
GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
|
||||
#endif
|
||||
|
||||
} // end namespase internal
|
||||
|
||||
|
@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
|
||||
|
||||
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
|
||||
#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
|
||||
template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
|
||||
struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
|
||||
{ \
|
||||
@ -113,14 +113,21 @@ static void run( \
|
||||
x_ptr=x_tmp.data(); \
|
||||
incx=1; \
|
||||
} else x_ptr=rhs; \
|
||||
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
|
||||
BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
|
||||
}\
|
||||
};
|
||||
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c)
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
|
||||
#else
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
|
||||
EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
|
||||
#endif
|
||||
|
||||
} // end namespase internal
|
||||
|
||||
|
@ -40,7 +40,7 @@ namespace internal {
|
||||
|
||||
/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
|
||||
|
||||
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
@ -81,13 +81,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
|
||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||
} else b = _rhs; \
|
||||
\
|
||||
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
\
|
||||
} \
|
||||
};
|
||||
|
||||
|
||||
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
@ -144,20 +144,26 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
|
||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||
} \
|
||||
\
|
||||
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
\
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_SYMM_L(double, double, d, d)
|
||||
EIGEN_BLAS_SYMM_L(float, float, f, s)
|
||||
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z)
|
||||
EIGEN_BLAS_HEMM_L(scomplex, float, cf, c)
|
||||
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
|
||||
EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
|
||||
EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
|
||||
EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
|
||||
#else
|
||||
EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
|
||||
EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
|
||||
EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
|
||||
EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
|
||||
#endif
|
||||
|
||||
/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
|
||||
|
||||
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
@ -197,13 +203,13 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
|
||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||
} else b = _lhs; \
|
||||
\
|
||||
BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
\
|
||||
} \
|
||||
};
|
||||
|
||||
|
||||
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||
template <typename Index, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
@ -259,15 +265,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
|
||||
ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
|
||||
} \
|
||||
\
|
||||
BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_SYMM_R(double, double, d, d)
|
||||
EIGEN_BLAS_SYMM_R(float, float, f, s)
|
||||
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z)
|
||||
EIGEN_BLAS_HEMM_R(scomplex, float, cf, c)
|
||||
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
|
||||
EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
|
||||
EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
|
||||
EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
|
||||
#else
|
||||
EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
|
||||
EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
|
||||
EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
|
||||
EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
|
||||
#endif
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
|
||||
x_tmp=map_x.conjugate(); \
|
||||
x_ptr=x_tmp.data(); \
|
||||
} else x_ptr=_rhs; \
|
||||
BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
|
||||
BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
|
||||
}\
|
||||
};
|
||||
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
|
||||
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
|
||||
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
|
||||
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
|
||||
#else
|
||||
EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
|
||||
EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
|
||||
EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
|
||||
EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
|
||||
// To work around an "error: member reference base type 'Matrix<...>
|
||||
// (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
|
||||
// not a structure or union" compilation error in nvcc (tested V8.0.61),
|
||||
// create a dummy internal::constructor_without_unaligned_array_assert
|
||||
// object to pass to the Matrix constructor.
|
||||
internal::constructor_without_unaligned_array_assert a;
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
|
||||
triangularBuffer.setZero();
|
||||
if((Mode&ZeroDiag)==ZeroDiag)
|
||||
triangularBuffer.diagonal().setZero();
|
||||
@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
|
||||
internal::constructor_without_unaligned_array_assert a;
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a);
|
||||
triangularBuffer.setZero();
|
||||
if((Mode&ZeroDiag)==ZeroDiag)
|
||||
triangularBuffer.diagonal().setZero();
|
||||
|
@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
|
||||
EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
|
||||
|
||||
// implements col-major += alpha * op(triangular) * op(general)
|
||||
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||
template <typename Index, int Mode, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
|
||||
} \
|
||||
/*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \
|
||||
/* call ?trmm*/ \
|
||||
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
|
||||
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
|
||||
\
|
||||
/* Add op(a_triangular)*b into res*/ \
|
||||
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
|
||||
@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_TRMM_L(double, double, d, d)
|
||||
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z)
|
||||
EIGEN_BLAS_TRMM_L(float, float, f, s)
|
||||
EIGEN_BLAS_TRMM_L(scomplex, float, cf, c)
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
|
||||
EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
|
||||
EIGEN_BLAS_TRMM_L(float, float, f, strmm)
|
||||
EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
|
||||
#else
|
||||
EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
|
||||
EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
|
||||
EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
|
||||
EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
|
||||
#endif
|
||||
|
||||
// implements col-major += alpha * op(general) * op(triangular)
|
||||
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
|
||||
template <typename Index, int Mode, \
|
||||
int LhsStorageOrder, bool ConjugateLhs, \
|
||||
int RhsStorageOrder, bool ConjugateRhs> \
|
||||
@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
|
||||
} \
|
||||
/*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \
|
||||
/* call ?trmm*/ \
|
||||
BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
|
||||
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \
|
||||
\
|
||||
/* Add op(a_triangular)*b into res*/ \
|
||||
Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
|
||||
@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_TRMM_R(double, double, d, d)
|
||||
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z)
|
||||
EIGEN_BLAS_TRMM_R(float, float, f, s)
|
||||
EIGEN_BLAS_TRMM_R(scomplex, float, cf, c)
|
||||
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
|
||||
EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
|
||||
EIGEN_BLAS_TRMM_R(float, float, f, strmm)
|
||||
EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
|
||||
#else
|
||||
EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
|
||||
EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
|
||||
EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
|
||||
EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
|
||||
#endif
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
|
||||
EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
|
||||
|
||||
// implements col-major: res += alpha * op(triangular) * vector
|
||||
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
|
||||
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
|
||||
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
|
||||
enum { \
|
||||
@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
|
||||
diag = IsUnitDiag ? 'U' : 'N'; \
|
||||
\
|
||||
/* call ?TRMV*/ \
|
||||
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
|
||||
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
|
||||
\
|
||||
/* Add op(a_tr)rhs into res*/ \
|
||||
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
|
||||
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
|
||||
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
|
||||
if (size<(std::max)(rows,cols)) { \
|
||||
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
|
||||
@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
|
||||
m = convert_index<BlasIndex>(size); \
|
||||
n = convert_index<BlasIndex>(cols-size); \
|
||||
} \
|
||||
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
|
||||
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
|
||||
} \
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_TRMV_CM(double, double, d, d)
|
||||
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
|
||||
EIGEN_BLAS_TRMV_CM(float, float, f, s)
|
||||
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_TRMV_CM(double, double, d, d,)
|
||||
EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,)
|
||||
EIGEN_BLAS_TRMV_CM(float, float, f, s,)
|
||||
EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,)
|
||||
#else
|
||||
EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
|
||||
EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
|
||||
EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
|
||||
EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
|
||||
#endif
|
||||
|
||||
// implements row-major: res += alpha * op(triangular) * vector
|
||||
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \
|
||||
template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
|
||||
struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
|
||||
enum { \
|
||||
@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
|
||||
diag = IsUnitDiag ? 'U' : 'N'; \
|
||||
\
|
||||
/* call ?TRMV*/ \
|
||||
BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
|
||||
BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
|
||||
\
|
||||
/* Add op(a_tr)rhs into res*/ \
|
||||
BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
|
||||
BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
|
||||
/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
|
||||
if (size<(std::max)(rows,cols)) { \
|
||||
if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
|
||||
@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
|
||||
m = convert_index<BlasIndex>(size); \
|
||||
n = convert_index<BlasIndex>(cols-size); \
|
||||
} \
|
||||
BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
|
||||
BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
|
||||
} \
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_TRMV_RM(double, double, d, d)
|
||||
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
|
||||
EIGEN_BLAS_TRMV_RM(float, float, f, s)
|
||||
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_TRMV_RM(double, double, d, d,)
|
||||
EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
|
||||
EIGEN_BLAS_TRMV_RM(float, float, f, s,)
|
||||
EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
|
||||
#else
|
||||
EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
|
||||
EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
|
||||
EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
|
||||
EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
|
||||
#endif
|
||||
|
||||
} // end namespase internal
|
||||
|
||||
|
@ -38,7 +38,7 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// implements LeftSide op(triangular)^-1 * general
|
||||
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
|
||||
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
|
||||
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
|
||||
{ \
|
||||
@ -80,18 +80,24 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
|
||||
} \
|
||||
if (IsUnitDiag) diag='U'; \
|
||||
/* call ?trsm*/ \
|
||||
BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
|
||||
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_TRSM_L(double, double, d)
|
||||
EIGEN_BLAS_TRSM_L(dcomplex, double, z)
|
||||
EIGEN_BLAS_TRSM_L(float, float, s)
|
||||
EIGEN_BLAS_TRSM_L(scomplex, float, c)
|
||||
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_TRSM_L(double, double, dtrsm)
|
||||
EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
|
||||
EIGEN_BLAS_TRSM_L(float, float, strsm)
|
||||
EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
|
||||
#else
|
||||
EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
|
||||
EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
|
||||
EIGEN_BLAS_TRSM_L(float, float, strsm_)
|
||||
EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
|
||||
#endif
|
||||
|
||||
// implements RightSide general * op(triangular)^-1
|
||||
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
|
||||
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
|
||||
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
|
||||
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
|
||||
{ \
|
||||
@ -133,16 +139,22 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
|
||||
} \
|
||||
if (IsUnitDiag) diag='U'; \
|
||||
/* call ?trsm*/ \
|
||||
BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
|
||||
BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
|
||||
/*std::cout << "TRMS_L specialization!\n";*/ \
|
||||
} \
|
||||
};
|
||||
|
||||
EIGEN_BLAS_TRSM_R(double, double, d)
|
||||
EIGEN_BLAS_TRSM_R(dcomplex, double, z)
|
||||
EIGEN_BLAS_TRSM_R(float, float, s)
|
||||
EIGEN_BLAS_TRSM_R(scomplex, float, c)
|
||||
|
||||
#ifdef EIGEN_USE_MKL
|
||||
EIGEN_BLAS_TRSM_R(double, double, dtrsm)
|
||||
EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
|
||||
EIGEN_BLAS_TRSM_R(float, float, strsm)
|
||||
EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
|
||||
#else
|
||||
EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
|
||||
EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
|
||||
EIGEN_BLAS_TRSM_R(float, float, strsm_)
|
||||
EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
@ -55,6 +55,7 @@
|
||||
#endif
|
||||
|
||||
#if defined __NVCC__
|
||||
#pragma diag_suppress boolean_controlling_expr_is_constant
|
||||
// Disable the "statement is unreachable" message
|
||||
#pragma diag_suppress code_is_unreachable
|
||||
// Disable the "dynamic initialization in unreachable code" message
|
||||
|
@ -49,10 +49,11 @@
|
||||
#define EIGEN_USE_LAPACKE
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_USE_MKL_VML)
|
||||
#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
|
||||
#define EIGEN_USE_MKL
|
||||
#endif
|
||||
|
||||
|
||||
#if defined EIGEN_USE_MKL
|
||||
# include <mkl.h>
|
||||
/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
|
||||
@ -108,6 +109,10 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
|
||||
#include "../../misc/blas.h"
|
||||
#endif
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
typedef std::complex<double> dcomplex;
|
||||
@ -121,8 +126,5 @@ typedef int BlasIndex;
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#if defined(EIGEN_USE_BLAS)
|
||||
#include "../../misc/blas.h"
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_MKL_SUPPORT_H
|
||||
|
@ -413,7 +413,7 @@
|
||||
// Does the compiler support variadic templates?
|
||||
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
|
||||
&& (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) )
|
||||
&& (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
|
||||
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
|
||||
// this prevents nvcc from crashing when compiling Eigen on Tegra X1
|
||||
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
|
||||
@ -427,9 +427,9 @@
|
||||
// Does the compiler fully support const expressions? (as in c++14)
|
||||
#ifndef EIGEN_HAS_CONSTEXPR
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#if defined(EIGEN_CUDACC)
|
||||
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
|
||||
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500))
|
||||
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
|
||||
#define EIGEN_HAS_CONSTEXPR 1
|
||||
#endif
|
||||
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
|
||||
@ -669,7 +669,7 @@ namespace Eigen {
|
||||
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
|
||||
* vectorized and non-vectorized code.
|
||||
*/
|
||||
#if (defined __CUDACC__)
|
||||
#if (defined EIGEN_CUDACC)
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
|
||||
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
|
||||
@ -837,7 +837,8 @@ namespace Eigen {
|
||||
// just an empty macro !
|
||||
#define EIGEN_EMPTY
|
||||
|
||||
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
|
||||
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
|
||||
// for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
|
||||
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
|
||||
using Base::operator =;
|
||||
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
|
||||
@ -990,7 +991,7 @@ namespace Eigen {
|
||||
# define EIGEN_TRY try
|
||||
# define EIGEN_CATCH(X) catch (X)
|
||||
#else
|
||||
# ifdef __CUDA_ARCH__
|
||||
# ifdef EIGEN_CUDA_ARCH
|
||||
# define EIGEN_THROW_X(X) asm("trap;")
|
||||
# define EIGEN_THROW asm("trap;")
|
||||
# else
|
||||
|
@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_META_H
|
||||
#define EIGEN_META_H
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#include <cfloat>
|
||||
#include <math_constants.h>
|
||||
#endif
|
||||
@ -169,7 +169,7 @@ template<bool Condition, typename T=void> struct enable_if;
|
||||
template<typename T> struct enable_if<true,T>
|
||||
{ typedef T type; };
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#if !defined(__FLT_EPSILON__)
|
||||
#define __FLT_EPSILON__ FLT_EPSILON
|
||||
#define __DBL_EPSILON__ DBL_EPSILON
|
||||
@ -523,13 +523,13 @@ template<typename T, typename U> struct scalar_product_traits
|
||||
|
||||
namespace numext {
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
|
||||
#else
|
||||
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
using internal::device::numeric_limits;
|
||||
#else
|
||||
using std::numeric_limits;
|
||||
|
@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const Quaterni
|
||||
if (n != Scalar(0))
|
||||
{
|
||||
m_angle = Scalar(2)*atan2(n, abs(q.w()));
|
||||
if(q.w() < 0)
|
||||
if(q.w() < Scalar(0))
|
||||
n = -n;
|
||||
m_axis = q.vec() / n;
|
||||
}
|
||||
|
@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase<Derived, 3>
|
||||
typedef typename internal::traits<Derived>::Scalar Scalar;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
typedef typename internal::traits<Derived>::Coefficients Coefficients;
|
||||
typedef typename Coefficients::CoeffReturnType CoeffReturnType;
|
||||
typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
|
||||
Scalar&, CoeffReturnType>::type NonConstCoeffReturnType;
|
||||
|
||||
|
||||
enum {
|
||||
Flags = Eigen::internal::traits<Derived>::Flags
|
||||
};
|
||||
@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase<Derived, 3>
|
||||
|
||||
|
||||
/** \returns the \c x coefficient */
|
||||
EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); }
|
||||
EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
|
||||
/** \returns the \c y coefficient */
|
||||
EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); }
|
||||
EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
|
||||
/** \returns the \c z coefficient */
|
||||
EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); }
|
||||
EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
|
||||
/** \returns the \c w coefficient */
|
||||
EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); }
|
||||
EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
|
||||
|
||||
/** \returns a reference to the \c x coefficient */
|
||||
EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
|
||||
/** \returns a reference to the \c y coefficient */
|
||||
EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
|
||||
/** \returns a reference to the \c z coefficient */
|
||||
EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
|
||||
/** \returns a reference to the \c w coefficient */
|
||||
EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
|
||||
/** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
|
||||
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
|
||||
/** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
|
||||
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
|
||||
/** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
|
||||
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
|
||||
/** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
|
||||
EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
|
||||
|
||||
/** \returns a read-only vector expression of the imaginary part (x,y,z) */
|
||||
EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
|
||||
|
@ -37,17 +37,20 @@ template<typename Scalar> class JacobiRotation
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
|
||||
/** Default constructor without any initialization. */
|
||||
EIGEN_DEVICE_FUNC
|
||||
JacobiRotation() {}
|
||||
|
||||
/** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
|
||||
EIGEN_DEVICE_FUNC
|
||||
JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
|
||||
|
||||
Scalar& c() { return m_c; }
|
||||
Scalar c() const { return m_c; }
|
||||
Scalar& s() { return m_s; }
|
||||
Scalar s() const { return m_s; }
|
||||
EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
|
||||
EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
|
||||
EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
|
||||
EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
|
||||
|
||||
/** Concatenates two planar rotation */
|
||||
EIGEN_DEVICE_FUNC
|
||||
JacobiRotation operator*(const JacobiRotation& other)
|
||||
{
|
||||
using numext::conj;
|
||||
@ -56,19 +59,26 @@ template<typename Scalar> class JacobiRotation
|
||||
}
|
||||
|
||||
/** Returns the transposed transformation */
|
||||
EIGEN_DEVICE_FUNC
|
||||
JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
|
||||
|
||||
/** Returns the adjoint transformation */
|
||||
EIGEN_DEVICE_FUNC
|
||||
JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
|
||||
EIGEN_DEVICE_FUNC
|
||||
bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC
|
||||
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
|
||||
EIGEN_DEVICE_FUNC
|
||||
void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
|
||||
|
||||
Scalar m_c, m_s;
|
||||
@ -264,6 +274,7 @@ namespace internal {
|
||||
* \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
|
||||
*/
|
||||
template<typename VectorX, typename VectorY, typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
|
||||
}
|
||||
|
||||
|
@ -1211,7 +1211,7 @@ void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index
|
||||
#endif
|
||||
}//end deflation
|
||||
|
||||
#ifndef __CUDACC__
|
||||
#ifndef EIGEN_CUDACC
|
||||
/** \svd_module
|
||||
*
|
||||
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
|
||||
|
@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
|
||||
// eval without temporary
|
||||
dst.resize(src.rows(), src.cols());
|
||||
dst.setZero();
|
||||
dst.reserve((std::max)(src.rows(),src.cols())*2);
|
||||
dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
|
||||
for (Index j=0; j<outerEvaluationSize; ++j)
|
||||
{
|
||||
dst.startVec(j);
|
||||
@ -107,7 +107,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
|
||||
|
||||
DstXprType temp(src.rows(), src.cols());
|
||||
|
||||
temp.reserve((std::max)(src.rows(),src.cols())*2);
|
||||
temp.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
|
||||
for (Index j=0; j<outerEvaluationSize; ++j)
|
||||
{
|
||||
temp.startVec(j);
|
||||
|
@ -327,7 +327,7 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
|
||||
internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
|
||||
m_isEtreeOk = true;
|
||||
|
||||
m_R.resize(m, n);
|
||||
m_R.resize(diagSize, n);
|
||||
m_Q.resize(m, diagSize);
|
||||
|
||||
// Allocate space for nonzero elements : rough estimation
|
||||
|
@ -120,6 +120,8 @@ run time. However, these assertions do cost time and can thus be turned off.
|
||||
- \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
|
||||
temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
|
||||
this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
|
||||
- \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host only,
|
||||
and never called from device code.
|
||||
|
||||
|
||||
- \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
|
||||
|
@ -261,6 +261,8 @@ x.setIdentity();
|
||||
Vector3f::UnitX() // 1 0 0
|
||||
Vector3f::UnitY() // 0 1 0
|
||||
Vector3f::UnitZ() // 0 0 1
|
||||
Vector4f::Unit(i)
|
||||
x.setUnit(i);
|
||||
\endcode
|
||||
</td>
|
||||
<td>
|
||||
@ -278,6 +280,7 @@ N/A
|
||||
|
||||
|
||||
VectorXf::Unit(size,i)
|
||||
x.setUnit(size,i);
|
||||
VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
|
||||
== Vector4f::UnitY()
|
||||
\endcode
|
||||
@ -285,7 +288,12 @@ VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
Note that it is allowed to call any of the \c set* functions to a dynamic-sized vector or matrix without passing new sizes.
|
||||
For instance:
|
||||
\code
|
||||
MatrixXi M(3,3);
|
||||
M.setIdentity();
|
||||
\endcode
|
||||
|
||||
\subsection QuickRef_Map Mapping external arrays
|
||||
|
||||
|
@ -63,6 +63,8 @@ In addition you can choose which parts will be substituted by defining one or mu
|
||||
<tr><td>\c EIGEN_USE_MKL_ALL </td><td>Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML </td></tr>
|
||||
</table>
|
||||
|
||||
The options can be combined with \b MKL_DIRECT_CALL to enable MKL direct call feature. This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices. To make it work properly, the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined.
|
||||
|
||||
Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
|
||||
|
||||
Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
|
||||
|
@ -3,18 +3,16 @@ namespace Eigen {
|
||||
|
||||
/** \page TopicCUDA Using Eigen in CUDA kernels
|
||||
|
||||
\b Disclaimer: this page is about an \b experimental feature in %Eigen.
|
||||
|
||||
Staring from CUDA 5.0, the CUDA compiler, \c nvcc, is able to properly parse %Eigen's code (almost).
|
||||
A few adaptations of the %Eigen's code already allows to use some parts of %Eigen in your own CUDA kernels.
|
||||
To this end you need the devel branch of %Eigen, CUDA 5.0 or greater with GCC.
|
||||
Staring from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays for fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc most Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords making them callable from both host and device code.
|
||||
This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen's header.
|
||||
This might be usefull to disable some warnings when a .cu file makes use of Eigen on the host side only.
|
||||
However, in both cases, host's SIMD vectorization has to be disabled in .cu files.
|
||||
It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files.
|
||||
|
||||
Known issues:
|
||||
|
||||
- \c nvcc with MS Visual Studio does not work (patch welcome)
|
||||
|
||||
- \c nvcc with \c clang does not work (patch welcome)
|
||||
|
||||
- \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file. To workaround this, you can add the following before including any other files:
|
||||
\code
|
||||
// workaround issue between gcc >= 4.7 and cuda 5.5
|
||||
|
7
doc/snippets/Matrix_Map_stride.cpp
Normal file
7
doc/snippets/Matrix_Map_stride.cpp
Normal file
@ -0,0 +1,7 @@
|
||||
Matrix4i A;
|
||||
A << 1, 2, 3, 4,
|
||||
5, 6, 7, 8,
|
||||
9, 10, 11, 12,
|
||||
13, 14, 15, 16;
|
||||
|
||||
std::cout << Matrix2i::Map(&A(1,1),Stride<8,2>()) << std::endl;
|
@ -104,7 +104,8 @@ void test_bdcsvd()
|
||||
CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
|
||||
|
||||
// Check that preallocation avoids subsequent mallocs
|
||||
CALL_SUBTEST_9( svd_preallocate<void>() );
|
||||
// Disbaled because not supported by BDCSVD
|
||||
// CALL_SUBTEST_9( svd_preallocate<void>() );
|
||||
|
||||
CALL_SUBTEST_2( svd_underoverflow<void>() );
|
||||
}
|
||||
|
@ -20,7 +20,7 @@
|
||||
|
||||
#include <math_constants.h>
|
||||
#include <cuda.h>
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -231,6 +231,19 @@ template<typename Scalar> void mapQuaternion(void){
|
||||
VERIFY_IS_APPROX(mq3*mq2, q3*q2);
|
||||
VERIFY_IS_APPROX(mcq1*mq2, q1*q2);
|
||||
VERIFY_IS_APPROX(mcq3*mq2, q3*q2);
|
||||
|
||||
// Bug 1461, compilation issue with Map<const Quat>::w(), and other reference/constness checks:
|
||||
VERIFY_IS_APPROX(mcq3.coeffs().x() + mcq3.coeffs().y() + mcq3.coeffs().z() + mcq3.coeffs().w(), mcq3.coeffs().sum());
|
||||
VERIFY_IS_APPROX(mcq3.x() + mcq3.y() + mcq3.z() + mcq3.w(), mcq3.coeffs().sum());
|
||||
mq3.w() = 1;
|
||||
const Quaternionx& cq3(q3);
|
||||
VERIFY( &cq3.x() == &q3.x() );
|
||||
const MQuaternionUA& cmq3(mq3);
|
||||
VERIFY( &cmq3.x() == &mq3.x() );
|
||||
// FIXME the following should be ok. The problem is that currently the LValueBit flag
|
||||
// is used to determine wether we can return a coeff by reference or not, which is not enough for Map<const ...>.
|
||||
//const MCQuaternionUA& cmcq3(mcq3);
|
||||
//VERIFY( &cmcq3.x() == &mcq3.x() );
|
||||
}
|
||||
|
||||
template<typename Scalar> void quaternionAlignment(void){
|
||||
|
@ -58,7 +58,7 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
|
||||
MatrixType m = MatrixType::Random(rows,cols);
|
||||
Scalar s1 = internal::random<Scalar>();
|
||||
|
||||
Index arraysize = 2*(rows+4)*(cols+4);
|
||||
Index arraysize = 4*(rows+4)*(cols+4);
|
||||
|
||||
Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
|
||||
Scalar* array1 = a_array1;
|
||||
@ -143,9 +143,62 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixTy
|
||||
VERIFY_IS_APPROX(map,s1*m);
|
||||
}
|
||||
|
||||
// test inner stride and no outer stride
|
||||
for(int k=0; k<2; ++k)
|
||||
{
|
||||
if(k==1 && (m.innerSize()*2)*m.outerSize() > maxsize2)
|
||||
break;
|
||||
Scalar* array = (k==0 ? array1 : array2);
|
||||
|
||||
Map<MatrixType, Alignment, InnerStride<Dynamic> > map(array, rows, cols, InnerStride<Dynamic>(2));
|
||||
map = m;
|
||||
VERIFY(map.outerStride() == map.innerSize()*2);
|
||||
for(int i = 0; i < m.outerSize(); ++i)
|
||||
for(int j = 0; j < m.innerSize(); ++j)
|
||||
{
|
||||
VERIFY(array[map.innerSize()*i*2+j*2] == m.coeffByOuterInner(i,j));
|
||||
VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
|
||||
}
|
||||
VERIFY_IS_APPROX(s1*map,s1*m);
|
||||
map *= s1;
|
||||
VERIFY_IS_APPROX(map,s1*m);
|
||||
}
|
||||
|
||||
internal::aligned_delete(a_array1, arraysize+1);
|
||||
}
|
||||
|
||||
// Additional tests for inner-stride but no outer-stride
|
||||
template<int>
|
||||
void bug1453()
|
||||
{
|
||||
const int data[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
typedef Matrix<int,Dynamic,Dynamic,RowMajor> RowMatrixXi;
|
||||
typedef Matrix<int,2,3,ColMajor> ColMatrix23i;
|
||||
typedef Matrix<int,3,2,ColMajor> ColMatrix32i;
|
||||
typedef Matrix<int,2,3,RowMajor> RowMatrix23i;
|
||||
typedef Matrix<int,3,2,RowMajor> RowMatrix32i;
|
||||
|
||||
VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
|
||||
VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
|
||||
VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
|
||||
VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
|
||||
|
||||
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
|
||||
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
|
||||
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
|
||||
VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
|
||||
|
||||
VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
|
||||
VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
|
||||
VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
|
||||
VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
|
||||
|
||||
VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
|
||||
VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
|
||||
VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
|
||||
VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
|
||||
}
|
||||
|
||||
void test_mapstride()
|
||||
{
|
||||
for(int i = 0; i < g_repeat; i++) {
|
||||
@ -175,6 +228,8 @@ void test_mapstride()
|
||||
CALL_SUBTEST_5( map_class_matrix<Unaligned>(MatrixXi(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
|
||||
CALL_SUBTEST_6( map_class_matrix<Aligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
|
||||
CALL_SUBTEST_6( map_class_matrix<Unaligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
|
||||
|
||||
CALL_SUBTEST_5( bug1453<0>() );
|
||||
|
||||
TEST_SET_BUT_UNUSED_VARIABLE(maxn);
|
||||
}
|
||||
|
@ -15,6 +15,10 @@ bool check_is_convertible(const From&, const To&)
|
||||
return internal::is_convertible<From,To>::value;
|
||||
}
|
||||
|
||||
struct FooReturnType {
|
||||
typedef int ReturnType;
|
||||
};
|
||||
|
||||
void test_meta()
|
||||
{
|
||||
VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
|
||||
@ -75,6 +79,11 @@ void test_meta()
|
||||
VERIFY((!check_is_convertible(A*B, f) ));
|
||||
VERIFY(( check_is_convertible(A*B, A) ));
|
||||
}
|
||||
|
||||
VERIFY(( internal::has_ReturnType<FooReturnType>::value ));
|
||||
VERIFY(( internal::has_ReturnType<ScalarBinaryOpTraits<int,int> >::value ));
|
||||
VERIFY(( !internal::has_ReturnType<MatrixXf>::value ));
|
||||
VERIFY(( !internal::has_ReturnType<int>::value ));
|
||||
|
||||
VERIFY(internal::meta_sqrt<1>::ret == 1);
|
||||
#define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))
|
||||
|
@ -191,6 +191,24 @@ void testVectorType(const VectorType& base)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// test setUnit()
|
||||
if(m.size()>0)
|
||||
{
|
||||
for(Index k=0; k<10; ++k)
|
||||
{
|
||||
Index i = internal::random<Index>(0,m.size()-1);
|
||||
m.setUnit(i);
|
||||
VERIFY_IS_APPROX( m, VectorType::Unit(m.size(), i) );
|
||||
}
|
||||
if(VectorType::SizeAtCompileTime==Dynamic)
|
||||
{
|
||||
Index i = internal::random<Index>(0,2*m.size()-1);
|
||||
m.setUnit(2*m.size(),i);
|
||||
VERIFY_IS_APPROX( m, VectorType::Unit(m.size(),i) );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<typename MatrixType>
|
||||
|
@ -9,6 +9,8 @@
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define TEST_ENABLE_TEMPORARY_TRACKING
|
||||
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
|
||||
// ^^ see bug 1449
|
||||
|
||||
#include "main.h"
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@ -1382,5 +1382,5 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_USE_GPU and __CUDACC__
|
||||
#endif // EIGEN_USE_GPU and EIGEN_CUDACC
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||
|
@ -553,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
|
||||
|
||||
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
|
||||
template <int StaticKernelSize>
|
||||
struct GetKernelSize {
|
||||
|
@ -211,7 +211,7 @@ struct GpuDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
|
||||
stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
@ -239,7 +239,7 @@ struct GpuDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
@ -265,7 +265,7 @@ struct GpuDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
|
||||
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH)
|
||||
cudaError_t err = cudaStreamSynchronize(stream_->stream());
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Error detected in CUDA stream: "
|
||||
@ -304,7 +304,7 @@ struct GpuDevice {
|
||||
// This function checks if the CUDA runtime recorded an error for the
|
||||
// underlying stream device.
|
||||
inline bool ok() const {
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
cudaError_t error = cudaStreamQuery(stream_->stream());
|
||||
return (error == cudaSuccess) || (error == cudaErrorNotReady);
|
||||
#else
|
||||
@ -323,9 +323,9 @@ struct GpuDevice {
|
||||
|
||||
|
||||
// FIXME: Should be device and kernel specific.
|
||||
#ifdef __CUDACC__
|
||||
#ifdef EIGEN_CUDACC
|
||||
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
|
||||
#ifndef __CUDA_ARCH__
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
|
||||
EIGEN_UNUSED_VARIABLE(status)
|
||||
assert(status == cudaSuccess);
|
||||
|
@ -35,7 +35,7 @@ struct DefaultDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
// Running on the host CPU
|
||||
return 1;
|
||||
#else
|
||||
@ -45,7 +45,7 @@ struct DefaultDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
// Running on the host CPU
|
||||
return l1CacheSize();
|
||||
#else
|
||||
@ -55,7 +55,7 @@ struct DefaultDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
// Running single threaded on the host CPU
|
||||
return l3CacheSize();
|
||||
#else
|
||||
@ -65,13 +65,13 @@ struct DefaultDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
// Running single threaded on the host CPU
|
||||
// Should return an enum that encodes the ISA supported by the CPU
|
||||
return 1;
|
||||
#else
|
||||
// Running on a CUDA device
|
||||
return __CUDA_ARCH__ / 100;
|
||||
return EIGEN_CUDA_ARCH / 100;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
@ -131,7 +131,7 @@ T loadConstant(const T* address) {
|
||||
return *address;
|
||||
}
|
||||
// Use the texture cache on CUDA devices whenever possible
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float loadConstant(const float* address) {
|
||||
return __ldg(address);
|
||||
|
@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
|
||||
};
|
||||
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#if defined(EIGEN_CUDACC)
|
||||
template <typename Evaluator, typename Index, bool Vectorizable>
|
||||
struct EigenMetaKernelEval {
|
||||
static __device__ EIGEN_ALWAYS_INLINE
|
||||
@ -264,7 +264,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
|
||||
evaluator.cleanup();
|
||||
}
|
||||
|
||||
#endif // __CUDACC__
|
||||
#endif // EIGEN_CUDACC
|
||||
#endif // EIGEN_USE_GPU
|
||||
|
||||
// SYCL Executor policy
|
||||
|
@ -35,7 +35,7 @@ namespace {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
return __clz(val);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return cl::sycl::clz(val);
|
||||
@ -53,7 +53,7 @@ namespace {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
return __clzll(val);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return cl::sycl::clz(val);
|
||||
@ -90,7 +90,7 @@ namespace {
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
return __umulhi(a, b);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
|
||||
@ -101,7 +101,7 @@ namespace {
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
return __umul64hi(a, b);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
|
||||
@ -124,7 +124,7 @@ namespace {
|
||||
template <typename T>
|
||||
struct DividerHelper<64, T> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
|
||||
#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
|
||||
#else
|
||||
const uint64_t shift = 1ULL << log_div;
|
||||
@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
return (__umulhi(magic, n) >> shift);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
|
||||
|
@ -27,7 +27,7 @@
|
||||
*/
|
||||
|
||||
// SFINAE requires variadic templates
|
||||
#ifndef __CUDACC__
|
||||
#ifndef EIGEN_CUDACC
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
// SFINAE doesn't work for gcc <= 4.7
|
||||
#ifdef EIGEN_COMP_GNUC
|
||||
|
@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
|
||||
};
|
||||
|
||||
// For CUDA packet types when using a GpuDevice
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
|
||||
template <>
|
||||
struct PacketType<half, GpuDevice> {
|
||||
typedef half2 type;
|
||||
|
@ -16,7 +16,7 @@ namespace internal {
|
||||
namespace {
|
||||
|
||||
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
|
||||
#ifdef __CUDA_ARCH__
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
// We don't support 3d kernels since we currently only use 1 and
|
||||
// 2d kernels.
|
||||
assert(threadIdx.z == 0);
|
||||
|
@ -334,7 +334,7 @@ struct OuterReducer {
|
||||
};
|
||||
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
template <int B, int N, typename S, typename R, typename I>
|
||||
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||
|
||||
@ -694,7 +694,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
||||
#ifdef EIGEN_USE_THREADS
|
||||
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
|
||||
#endif
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
||||
@ -781,7 +781,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
||||
Op m_reducer;
|
||||
|
||||
// For full reductions
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
|
||||
static const bool RunningOnSycl = false;
|
||||
#elif defined(EIGEN_USE_SYCL)
|
||||
|
@ -14,7 +14,7 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
// Full reducers for GPU, don't vectorize for now
|
||||
|
||||
// Reducer function that enables multiple cuda thread to safely accumulate at the same
|
||||
@ -23,7 +23,7 @@ namespace internal {
|
||||
// updated the content of the output address it will try again.
|
||||
template <typename T, typename R>
|
||||
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
#if EIGEN_CUDA_ARCH >= 300
|
||||
if (sizeof(T) == 4)
|
||||
{
|
||||
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
||||
@ -102,7 +102,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
|
||||
|
||||
template <>
|
||||
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
#if EIGEN_CUDA_ARCH >= 300
|
||||
atomicAdd(output, accum);
|
||||
#else // __CUDA_ARCH__ >= 300
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
|
||||
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
#if EIGEN_CUDA_ARCH >= 300
|
||||
// Initialize the output value
|
||||
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
|
||||
if (gridDim.x == 1) {
|
||||
@ -372,7 +372,7 @@ template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||
typename Self::CoeffReturnType* output) {
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
#if EIGEN_CUDA_ARCH >= 300
|
||||
typedef typename Self::CoeffReturnType Type;
|
||||
eigen_assert(blockDim.y == 1);
|
||||
eigen_assert(blockDim.z == 1);
|
||||
|
@ -242,7 +242,7 @@ struct ScanLauncher {
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
|
||||
// GPU implementation of scan
|
||||
// TODO(ibab) This placeholder implementation performs multiple scans in
|
||||
@ -281,7 +281,7 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
|
||||
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
|
||||
}
|
||||
};
|
||||
#endif // EIGEN_USE_GPU && __CUDACC__
|
||||
#endif // EIGEN_USE_GPU && EIGEN_CUDACC
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
@ -15,7 +15,7 @@
|
||||
// The array class is only available starting with cxx11. Emulate our own here
|
||||
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
|
||||
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
|
||||
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
|
||||
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
|
||||
|
||||
namespace Eigen {
|
||||
template <typename T, size_t n> class array {
|
||||
|
@ -121,7 +121,7 @@ template <>
|
||||
struct lgamma_impl<float> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE float run(float x) {
|
||||
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
||||
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
||||
int dummy;
|
||||
return ::lgammaf_r(x, &dummy);
|
||||
#else
|
||||
@ -134,7 +134,7 @@ template <>
|
||||
struct lgamma_impl<double> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE double run(double x) {
|
||||
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
||||
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
||||
int dummy;
|
||||
return ::lgamma_r(x, &dummy);
|
||||
#else
|
||||
|
@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 plgamma<float4>(const float4& a)
|
||||
|
@ -12,7 +12,7 @@
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -13,7 +13,7 @@
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -11,7 +11,7 @@
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_complex
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -11,7 +11,7 @@
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -14,7 +14,7 @@
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -12,7 +12,7 @@
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -13,7 +13,7 @@
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -13,7 +13,7 @@
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -13,7 +13,7 @@
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -12,7 +12,7 @@
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if dEIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
@ -13,7 +13,7 @@
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#include "main.h"
|
||||
|
Loading…
x
Reference in New Issue
Block a user