Gael Guennebaud 2016-06-14 15:33:47 +02:00
commit 76236cdea4
28 changed files with 376 additions and 156 deletions

View File

@@ -32,6 +32,7 @@
   * \endcode
   */
+#include "src/misc/RealSvd2x2.h"
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"

View File

@@ -31,6 +31,7 @@
   * \endcode
   */
+#include "src/misc/RealSvd2x2.h"
 #include "src/SVD/UpperBidiagonalization.h"
 #include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"

View File

@@ -149,7 +149,7 @@ class Array
 #if EIGEN_HAS_RVALUE_REFERENCES
     EIGEN_DEVICE_FUNC
-    Array(Array&& other)
+    Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
      : Base(std::move(other))
     {
       Base::_check_template_params();
@@ -157,7 +157,7 @@ class Array
       Base::_set_noalias(other);
     }
     EIGEN_DEVICE_FUNC
-    Array& operator=(Array&& other)
+    Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
    {
      other.swap(*this);
      return *this;
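
Editor's note: a minimal sketch, not part of the patch, of what the conditional noexcept buys in practice (assumes a standard-conforming standard library):

#include <Eigen/Core>
#include <utility>

int main()
{
  Eigen::ArrayXd a = Eigen::ArrayXd::Random(1000);
  // Move construction of a dynamically sized Array; with the new EIGEN_NOEXCEPT_IF
  // annotation this is advertised as noexcept whenever Scalar's move cannot throw,
  // which lets containers such as std::vector relocate elements by moving instead of copying.
  Eigen::ArrayXd b = std::move(a);
  return b.size() == 1000 ? 0 : 1;
}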

View File

@@ -28,6 +28,8 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
     AlignedOnScalar = 1,
     size=2,
     HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasMul    = 1,
     HasDiv    = 1,
     HasSqrt   = 1,
     HasRsqrt  = 1,

View File

@@ -14,8 +14,9 @@ namespace Eigen {
 namespace internal {
-static uint32x4_t  p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
-static uint32x2_t  p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
+const uint32_t  conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static uint32x4_t  p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );
+static uint32x2_t  p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );
 //---------- float ----------
 struct Packet2cf
@@ -274,7 +275,8 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
 //---------- double ----------
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
+const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
+static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
 struct Packet1cd
 {

View File

@@ -49,17 +49,6 @@ typedef uint32x4_t  Packet4ui;
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
-#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
-  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
-#else
-  //Default initializer for packets
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
-#endif
 // arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
 // which available on LLVM and GCC (at least)
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -122,12 +111,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const float32_t f[] = {0, 1, 2, 3};
+  Packet4f countdown = vld1q_f32(f);
   return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
 {
-  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
+  const int32_t i[] = {0, 1, 2, 3};
+  Packet4i countdown = vld1q_s32(i);
   return vaddq_s32(pset1<Packet4i>(a), countdown);
 }
@@ -585,7 +576,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { r
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
-  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
+  const double countdown_raw[] = {0.0,1.0};
+  const Packet2d countdown = vld1q_f64(countdown_raw);
   return vaddq_f64(pset1<Packet2d>(a), countdown);
 }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
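
Editor's note: a hedged standalone illustration of the new initialization pattern (hypothetical helper, not part of the patch; compiles only on ARM targets with <arm_neon.h>, and vdupq_n_f32 stands in for pset1 here):

#include <arm_neon.h>

// Build the packet {a, a+1, a+2, a+3} the same way the new plset<Packet4f> does:
// load the constants from a plain array instead of relying on compiler-specific
// brace-initialization of the NEON vector type.
float32x4_t plset4_sketch(float a)
{
  const float32_t f[] = {0, 1, 2, 3};
  float32x4_t countdown = vld1q_f32(f);
  return vaddq_f32(vdupq_n_f32(a), countdown);
}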

View File

@@ -328,6 +328,30 @@ struct result_of<Func(ArgType0,ArgType1)> {
   enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
   typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
 };
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2, int SizeOf=sizeof(has_none)>
+struct ternary_result_of_select {typedef typename internal::remove_all<ArgType0>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_std_result_type)>
+{typedef typename Func::result_type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, sizeof(has_tr1_result)>
+{typedef typename Func::template result<Func(ArgType0,ArgType1,ArgType2)>::type type;};
+
+template<typename Func, typename ArgType0, typename ArgType1, typename ArgType2>
+struct result_of<Func(ArgType0,ArgType1,ArgType2)> {
+  template<typename T>
+  static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+  template<typename T>
+  static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1,ArgType2)>::type const * = 0);
+  static has_none            testFunctor(...);
+
+  // note that the following indirection is needed for gcc-3.3
+  enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
+  typedef typename ternary_result_of_select<Func, ArgType0, ArgType1, ArgType2, FunctorType>::type type;
+};
 #endif
 
 /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
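
Editor's note: to illustrate what the new three-argument specialization resolves, here is a hypothetical functor (not part of the patch) exposing a nested result_type, which is picked up through the has_std_result_type path:

struct clamp_op {
  typedef double result_type;  // detected by result_of via the has_std_result_type overload
  double operator()(double x, double lo, double hi) const
  { return x < lo ? lo : (x > hi ? hi : x); }
};

// Eigen::internal::result_of<clamp_op(double,double,double)>::type is then double,
// mirroring what the existing unary and binary specializations already do.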

View File

@@ -327,33 +327,22 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
       }
       else
       {
-        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a triangular 2x2 block T
-        // From the eigen decomposition of T = U * E * U^-1,
-        // we can extract the eigenvalues of (U^-1 * S * U) / E
-        // Here, we can take advantage that E = diag(T), and U = [ 1 T_01 ; 0 T_11-T_00], and U^-1 = [1 -T_11/(T_11-T_00) ; 0 1/(T_11-T_00)].
-        // Then taking beta=T_00*T_11*(T_11-T_00), we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00) * (T_11-T_00):
-        // T = [a b ; 0 c]
-        // S = [e f ; g h]
-        RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i, i+1), c = m_realQZ.matrixT().coeff(i+1, i+1);
-        RealScalar e = m_matS.coeff(i, i), f = m_matS.coeff(i, i+1), g = m_matS.coeff(i+1, i), h = m_matS.coeff(i+1, i+1);
-        RealScalar d = c-a;
-        RealScalar gb = g*b;
-        Matrix<RealScalar,2,2> A;
-        A << (e*d-gb)*c, ((e*b+f*d-h*b)*d-gb*b)*a,
-             g*c       , (gb+h*d)*a;
-        // NOTE, we could also compute the SVD of T's block during the QZ factorization so that the respective T block is guaranteed to be diagonal,
-        // and then we could directly apply the formula below (while taking care of scaling S columns by T11,T00):
-        Scalar p = Scalar(0.5) * (A.coeff(i, i) - A.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + A.coeff(i+1, i) * A.coeff(i, i+1)));
-        m_alphas.coeffRef(i)   = ComplexScalar(A.coeff(i+1, i+1) + p, z);
-        m_alphas.coeffRef(i+1) = ComplexScalar(A.coeff(i+1, i+1) + p, -z);
+        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a positive diagonal 2x2 block T
+        // Then taking beta=T_00*T_11, we can avoid any division, and alpha is the eigenvalues of A = (U^-1 * S * U) * diag(T_11,T_00):
+        // T =  [a 0]
+        //      [0 b]
+        RealScalar a = m_realQZ.matrixT().coeff(i, i), b = m_realQZ.matrixT().coeff(i+1, i+1);
+        Matrix<RealScalar,2,2> S2 = m_matS.template block<2,2>(i,i) * Matrix<Scalar,2,1>(b,a).asDiagonal();
+
+        Scalar p = Scalar(0.5) * (S2.coeff(0,0) - S2.coeff(1,1));
+        Scalar z = sqrt(abs(p * p + S2.coeff(1,0) * S2.coeff(0,1)));
+        m_alphas.coeffRef(i)   = ComplexScalar(S2.coeff(1,1) + p, z);
+        m_alphas.coeffRef(i+1) = ComplexScalar(S2.coeff(1,1) + p, -z);
         m_betas.coeffRef(i)   =
-        m_betas.coeffRef(i+1) = a*c*d;
+        m_betas.coeffRef(i+1) = a*b;
         i += 2;
       }
     }
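
Editor's note on the algebra behind the new division-free path (consistent with the comments above): with the T block reduced to $T=\mathrm{diag}(a,b)$, $a,b>0$, one has $\det(S-\lambda T)=0 \iff \det(S\,\mathrm{diag}(b,a)-\lambda\,ab\,I)=0$, since multiplying by $\mathrm{diag}(b,a)$ only scales the determinant by $ab>0$. Hence with $S_2 = S\,\mathrm{diag}(b,a)$ and $\beta = ab$, the eigenvalues $\alpha$ of $S_2$ yield the pairs $(\alpha,\beta)$, i.e. $\lambda = \alpha/\beta$, without performing any division in the code.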

View File

@@ -552,7 +552,6 @@ namespace Eigen {
         m_T.coeffRef(l,l-1) = Scalar(0.0);
     }
 
   template<typename MatrixType>
   RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ)
   {
@@ -616,6 +615,37 @@ namespace Eigen {
       }
       // check if we converged before reaching iterations limit
       m_info = (local_iter<m_maxIters) ? Success : NoConvergence;
+
+      // For each non triangular 2x2 diagonal block of S,
+      //    reduce the respective 2x2 diagonal block of T to positive diagonal form using 2x2 SVD.
+      // This step is not mandatory for QZ, but it does help further extraction of eigenvalues/eigenvectors,
+      // and is in par with Lapack/Matlab QZ.
+      if(m_info==Success)
+      {
+        for(Index i=0; i<dim-1; ++i)
+        {
+          if(m_S.coeff(i+1, i) != Scalar(0))
+          {
+            JacobiRotation<Scalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_T, i, i+1, &j_left, &j_right);
+
+            // Apply resulting Jacobi rotations
+            m_S.applyOnTheLeft(i,i+1,j_left);
+            m_S.applyOnTheRight(i,i+1,j_right);
+            m_T.applyOnTheLeft(i,i+1,j_left);
+            m_T.applyOnTheRight(i,i+1,j_right);
+            m_T(i+1,i) = m_T(i,i+1) = Scalar(0);
+
+            if(m_computeQZ) {
+              m_Q.applyOnTheRight(i,i+1,j_left.transpose());
+              m_Z.applyOnTheLeft(i,i+1,j_right.transpose());
+            }
+
+            i++;
+          }
+        }
+      }
+
       return *this;
     } // end compute
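
Editor's note: a hedged usage sketch of the decomposition being post-processed here (standard RealQZ API; the invariants checked mirror the test file later in this commit):

#include <Eigen/Eigenvalues>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 6);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(6, 6);

  Eigen::RealQZ<Eigen::MatrixXd> qz(A, B);  // A = Q S Z,  B = Q T Z
  // After this commit, each 2x2 diagonal block of S (a complex eigenvalue pair)
  // faces a positive diagonal 2x2 block of T.
  std::cout << "||Q S Z - A|| = " << (qz.matrixQ() * qz.matrixS() * qz.matrixZ() - A).norm() << "\n"
            << "||Q T Z - B|| = " << (qz.matrixQ() * qz.matrixT() * qz.matrixZ() - B).norm() << "\n";
  return qz.info() == Eigen::Success ? 0 : 1;
}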

View File

@@ -367,10 +367,10 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
     hCoeffs.tail(n-i-1).noalias() = (matA.bottomRightCorner(remainingSize,remainingSize).template selfadjointView<Lower>()
                                     * (conj(h) * matA.col(i).tail(remainingSize)));
-    hCoeffs.tail(n-i-1) += (conj(h)*Scalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
+    hCoeffs.tail(n-i-1) += (conj(h)*RealScalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
     matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>()
-      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), -1);
+      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), Scalar(-1));
     matA.col(i).coeffRef(i+1) = beta;
     hCoeffs.coeffRef(i) = h;

View File

@@ -1367,7 +1367,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is
     EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
     Matrix<typename ResultType::Scalar, Dim+1, 1> rhs;
-    rhs << other,1;
+    rhs.template head<Dim>() = other; rhs[Dim] = typename ResultType::Scalar(1);
     Matrix<typename ResultType::Scalar, WorkingRows, 1> res(T.matrix() * rhs);
     return res.template head<Dim>();
   }

View File

@@ -183,7 +183,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
     {
       if(m_isInitialized) // Factorization ran at least once
       {
-        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, m_size,0, 0, 0, m_perm.data(), 0,
+        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, internal::convert_index<StorageIndex>(m_size),0, 0, 0, m_perm.data(), 0,
                                                           m_iparm.data(), m_msglvl, NULL, NULL);
         m_isInitialized = false;
       }
@@ -194,11 +194,11 @@ class PardisoImpl : public SparseSolverBase<Derived>
      m_type = type;
      bool symmetric = std::abs(m_type) < 10;
      m_iparm[0] = 1;   // No solver default
-     m_iparm[1] = 3;   // use Metis for the ordering
-     m_iparm[2] = 1;   // Numbers of processors, value of OMP_NUM_THREADS
+     m_iparm[1] = 2;   // use Metis for the ordering
+     m_iparm[2] = 0;   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
      m_iparm[3] = 0;   // No iterative-direct algorithm
      m_iparm[4] = 0;   // No user fill-in reducing permutation
-     m_iparm[5] = 0;   // Write solution into x
+     m_iparm[5] = 0;   // Write solution into x, b is left unchanged
      m_iparm[6] = 0;   // Not in use
      m_iparm[7] = 2;   // Max numbers of iterative refinement steps
      m_iparm[8] = 0;   // Not in use
@@ -219,7 +219,8 @@ class PardisoImpl : public SparseSolverBase<Derived>
      m_iparm[26] = 0;  // No matrix checker
      m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
      m_iparm[34] = 1;  // C indexing
-     m_iparm[59] = 1;  // Automatic switch between In-Core and Out-of-Core modes
+     m_iparm[36] = 0;  // CSR
+     m_iparm[59] = 0;  // 0 - In-Core ; 1 - Automatic switch between In-Core and Out-of-Core modes ; 2 - Out-of-Core
      memset(m_pt, 0, sizeof(m_pt));
     }
@@ -246,7 +247,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
     mutable SparseMatrixType m_matrix;
     mutable ComputationInfo m_info;
     bool m_analysisIsOk, m_factorizationIsOk;
-    Index m_type, m_msglvl;
+    StorageIndex m_type, m_msglvl;
     mutable void *m_pt[64];
     mutable ParameterType m_iparm;
     mutable IntColVectorType m_perm;
@@ -265,10 +266,9 @@ Derived& PardisoImpl<Derived>::compute(const MatrixType& a)
   derived().getMatrix(a);
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = true;
@@ -287,7 +287,7 @@ Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a)
   derived().getMatrix(a);
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
@@ -306,8 +306,8 @@ Derived& PardisoImpl<Derived>::factorize(const MatrixType& a)
   derived().getMatrix(a);
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
                                                             m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
@@ -354,9 +354,9 @@ void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase
   }
   Index error;
-  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, m_size,
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, internal::convert_index<StorageIndex>(m_size),
                                                             m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                            m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
+                                                            m_perm.data(), internal::convert_index<StorageIndex>(nrhs), m_iparm.data(), m_msglvl,
                                                             rhs_ptr, x.derived().data());
   manageErrorCode(error);
@@ -371,6 +371,9 @@ void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase
   * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
   * \implsparsesolverconcept
@@ -421,6 +424,9 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
   * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
@@ -480,6 +486,9 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
   * For complex matrices, A can also be symmetric only, see the \a Options template parameter.
   * The vectors or matrices X and B can be either dense or sparse.
   *
+  * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+  * \code solver.pardisoParameterArray()[59] = 1; \endcode
+  *
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the upper triangular part has to be used.
   *         Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix.
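
Editor's note: a hedged usage sketch of the documented knob (requires an Eigen build with Intel MKL PARDISO support; the matrix content is purely illustrative):

#include <Eigen/PardisoSupport>
#include <Eigen/SparseCore>
#include <vector>

int main()
{
  // Small tridiagonal example system.
  const int n = 100;
  std::vector<Eigen::Triplet<double> > triplets;
  for (int i = 0; i < n; ++i) {
    triplets.push_back(Eigen::Triplet<double>(i, i, 4.0));
    if (i + 1 < n) {
      triplets.push_back(Eigen::Triplet<double>(i, i + 1, -1.0));
      triplets.push_back(Eigen::Triplet<double>(i + 1, i, -1.0));
    }
  }
  Eigen::SparseMatrix<double> A(n, n);
  A.setFromTriplets(triplets.begin(), triplets.end());
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);

  Eigen::PardisoLU<Eigen::SparseMatrix<double> > solver;
  solver.pardisoParameterArray()[59] = 1;  // opt back into automatic in-core/out-of-core switching
  solver.compute(A);
  Eigen::VectorXd x = solver.solve(b);
  return solver.info() == Eigen::Success ? 0 : 1;
}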

View File

@@ -419,38 +419,6 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
   }
 };
-template<typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                         JacobiRotation<RealScalar> *j_left,
-                         JacobiRotation<RealScalar> *j_right)
-{
-  using std::sqrt;
-  using std::abs;
-  Matrix<RealScalar,2,2> m;
-  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
-       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
-  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(d == RealScalar(0))
-  {
-    rot1.s() = RealScalar(0);
-    rot1.c() = RealScalar(1);
-  }
-  else
-  {
-    // If d!=0, then t/d cannot overflow because the magnitude of the
-    // entries forming d are not too small compared to the ones forming t.
-    RealScalar u = t / d;
-    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = RealScalar(1) / tmp;
-    rot1.c() = u / tmp;
-  }
-  m.applyOnTheLeft(0,1,rot1);
-  j_right->makeJacobi(m,0,1);
-  *j_left  = rot1 * j_right->transpose();
-}
 template<typename _MatrixType, int QRPreconditioner>
 struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
 {

View File

@@ -0,0 +1,54 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2013-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_REALSVD2X2_H
#define EIGEN_REALSVD2X2_H
namespace Eigen {
namespace internal {
template<typename MatrixType, typename RealScalar, typename Index>
void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
JacobiRotation<RealScalar> *j_left,
JacobiRotation<RealScalar> *j_right)
{
using std::sqrt;
using std::abs;
Matrix<RealScalar,2,2> m;
m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
JacobiRotation<RealScalar> rot1;
RealScalar t = m.coeff(0,0) + m.coeff(1,1);
RealScalar d = m.coeff(1,0) - m.coeff(0,1);
if(d == RealScalar(0))
{
rot1.s() = RealScalar(0);
rot1.c() = RealScalar(1);
}
else
{
// If d!=0, then t/d cannot overflow because the magnitude of the
// entries forming d are not too small compared to the ones forming t.
RealScalar u = t / d;
RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
rot1.s() = RealScalar(1) / tmp;
rot1.c() = u / tmp;
}
m.applyOnTheLeft(0,1,rot1);
j_right->makeJacobi(m,0,1);
*j_left = rot1 * j_right->transpose();
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_REALSVD2X2_H
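
Editor's note: a hedged sketch of how this shared internal helper is meant to be called (signature taken from the file above; the left/right application pattern mirrors the RealQZ change earlier in this commit):

#include <Eigen/SVD>     // after this commit, pulls in src/misc/RealSvd2x2.h
#include <Eigen/Jacobi>

// Diagonalize the 2x2 block at rows/cols (0,1) of m in place, as RealQZ now does for T.
void diagonalize_2x2_block(Eigen::Matrix2d& m)
{
  Eigen::JacobiRotation<double> j_left, j_right;
  Eigen::internal::real_2x2_jacobi_svd(m, 0, 1, &j_left, &j_right);
  m.applyOnTheLeft(0, 1, j_left);
  m.applyOnTheRight(0, 1, j_right);
  // m(0,1) and m(1,0) are now zero up to rounding.
}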

View File

@@ -247,6 +247,7 @@ tan() const
   *
   * \sa tan(), asin(), acos()
   */
+EIGEN_DEVICE_FUNC
 inline const AtanReturnType
 atan() const
 {
@@ -288,6 +289,7 @@ asin() const
   *
   * \sa tan(), sinh(), cosh()
   */
+EIGEN_DEVICE_FUNC
 inline const TanhReturnType
 tanh() const
 {
@@ -301,6 +303,7 @@ tanh() const
   *
   * \sa sin(), tanh(), cosh()
   */
+EIGEN_DEVICE_FUNC
 inline const SinhReturnType
 sinh() const
 {
@@ -314,6 +317,7 @@ sinh() const
   *
   * \sa tan(), sinh(), cosh()
   */
+EIGEN_DEVICE_FUNC
 inline const CoshReturnType
 cosh() const
 {
@@ -331,6 +335,7 @@ cosh() const
   *
   * \sa digamma()
   */
+EIGEN_DEVICE_FUNC
 inline const LgammaReturnType
 lgamma() const
 {
@@ -345,6 +350,7 @@ lgamma() const
   *
   * \sa Eigen::digamma(), Eigen::polygamma(), lgamma()
   */
+EIGEN_DEVICE_FUNC
 inline const DigammaReturnType
 digamma() const
 {
@@ -363,6 +369,7 @@ digamma() const
   *
   * \sa erfc()
   */
+EIGEN_DEVICE_FUNC
 inline const ErfReturnType
 erf() const
 {
@@ -381,6 +388,7 @@ erf() const
   *
   * \sa erf()
   */
+EIGEN_DEVICE_FUNC
 inline const ErfcReturnType
 erfc() const
 {
@@ -436,6 +444,7 @@ cube() const
   *
   * \sa ceil(), floor()
   */
+EIGEN_DEVICE_FUNC
 inline const RoundReturnType
 round() const
 {
@@ -449,6 +458,7 @@ round() const
   *
   * \sa ceil(), round()
   */
+EIGEN_DEVICE_FUNC
 inline const FloorReturnType
 floor() const
 {
@@ -462,6 +472,7 @@ floor() const
   *
   * \sa floor(), round()
   */
+EIGEN_DEVICE_FUNC
 inline const CeilReturnType
 ceil() const
 {
@@ -475,6 +486,7 @@ ceil() const
   *
   * \sa isfinite(), isinf()
   */
+EIGEN_DEVICE_FUNC
 inline const IsNaNReturnType
 isNaN() const
 {
@@ -488,6 +500,7 @@ isNaN() const
   *
   * \sa isnan(), isfinite()
   */
+EIGEN_DEVICE_FUNC
 inline const IsInfReturnType
 isInf() const
 {
@@ -501,6 +514,7 @@ isInf() const
   *
   * \sa isnan(), isinf()
   */
+EIGEN_DEVICE_FUNC
 inline const IsFiniteReturnType
 isFinite() const
 {

View File

@@ -44,4 +44,8 @@ before-evaluators
 7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
 7591:09a8e2186610 # 3.3-alpha1
 7650:b0f3c8f43025 # help clang inlining
+8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
+8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
+8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
+8985:d935df21a082 # Remove the rotating kernel.

View File

@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,6 +10,7 @@
 #include "main.h"
 #include <limits>
 #include <Eigen/Eigenvalues>
+#include <Eigen/LU>
 
 template<typename MatrixType> void generalized_eigensolver_real(const MatrixType& m)
 {
@@ -21,6 +22,7 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   Index cols = m.cols();
 
   typedef typename MatrixType::Scalar Scalar;
+  typedef std::complex<Scalar> ComplexScalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
 
   MatrixType a = MatrixType::Random(rows,cols);
@@ -31,14 +33,28 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType
   MatrixType spdB = b.adjoint() * b + b1.adjoint() * b1;
 
   // lets compare to GeneralizedSelfAdjointEigenSolver
-  GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
-  GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
-  VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
-  VectorType realEigenvalues = eig.eigenvalues().real();
-  std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
-  VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+  {
+    GeneralizedSelfAdjointEigenSolver<MatrixType> symmEig(spdA, spdB);
+    GeneralizedEigenSolver<MatrixType> eig(spdA, spdB);
+    VERIFY_IS_EQUAL(eig.eigenvalues().imag().cwiseAbs().maxCoeff(), 0);
+
+    VectorType realEigenvalues = eig.eigenvalues().real();
+    std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size());
+    VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues());
+  }
+
+  // non symmetric case:
+  {
+    GeneralizedEigenSolver<MatrixType> eig(a,b);
+    for(Index k=0; k<cols; ++k)
+    {
+      Matrix<ComplexScalar,Dynamic,Dynamic> tmp = (eig.betas()(k)*a).template cast<ComplexScalar>() - eig.alphas()(k)*b;
+      if(tmp.norm()>(std::numeric_limits<Scalar>::min)())
+        tmp /= tmp.norm();
+      VERIFY_IS_MUCH_SMALLER_THAN( std::abs(tmp.determinant()), Scalar(1) );
+    }
+  }
 
   // regression test for bug 1098
   {
@@ -57,7 +73,7 @@ void test_eigensolver_generalized_real()
     s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/4);
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(s,s)) );
 
-    // some trivial but implementation-wise tricky cases
+    // some trivial but implementation-wise special cases
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(1,1)) );
     CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) );
     CALL_SUBTEST_3( generalized_eigensolver_real(Matrix<double,1,1>()) );
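
Editor's note: the non-symmetric loop above checks the defining property of the returned pairs, namely that each $(\alpha_k,\beta_k)$ satisfies $\det(\beta_k A - \alpha_k B) = 0$ (with $\lambda_k = \alpha_k/\beta_k$ whenever $\beta_k \neq 0$); the normalization by tmp.norm() merely guards against overflow before taking the determinant.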

View File

@@ -58,6 +58,8 @@ template<typename Scalar,int Size> void homogeneous(void)
   T2MatrixType t2 = T2MatrixType::Random();
   VERIFY_IS_APPROX(t2 * (v0.homogeneous().eval()), t2 * v0.homogeneous());
   VERIFY_IS_APPROX(t2 * (m0.colwise().homogeneous().eval()), t2 * m0.colwise().homogeneous());
+  VERIFY_IS_APPROX(t2 * (v0.homogeneous().asDiagonal()), t2 * hv0.asDiagonal());
+  VERIFY_IS_APPROX((v0.homogeneous().asDiagonal()) * t2, hv0.asDiagonal() * t2);
 
   VERIFY_IS_APPROX((v0.transpose().rowwise().homogeneous().eval()) * t2,
                     v0.transpose().rowwise().homogeneous() * t2);

View File

@@ -49,11 +49,20 @@ template<typename MatrixType> void real_qz(const MatrixType& m)
   for (Index i=0; i<A.cols(); i++)
     for (Index j=0; j<i; j++) {
       if (abs(qz.matrixT()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: T(" << i << "," << j << ") = " << qz.matrixT()(i,j) << std::endl;
         all_zeros = false;
+      }
       if (j<i-1 && abs(qz.matrixS()(i,j))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j) << std::endl;
         all_zeros = false;
+      }
       if (j==i-1 && j>0 && abs(qz.matrixS()(i,j))!=Scalar(0.0) && abs(qz.matrixS()(i-1,j-1))!=Scalar(0.0))
+      {
+        std::cerr << "Error: S(" << i << "," << j << ") = " << qz.matrixS()(i,j) << " && S(" << i-1 << "," << j-1 << ") = " << qz.matrixS()(i-1,j-1) << std::endl;
         all_zeros = false;
+      }
     }
   VERIFY_IS_EQUAL(all_zeros, true);
   VERIFY_IS_APPROX(qz.matrixQ()*qz.matrixS()*qz.matrixZ(), A);

View File

@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // across k dimension.
     const TensorOpCost cost =
         contractionCost(m, n, bm, bn, bk, shard_by_col, false);
-    Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());
 
     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   class Context {
    public:
     Context(const Device& device, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
             Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
             Index gn, Index nm0, Index nn0, bool shard_by_col,
             bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           lhs_(lhs),
           rhs_(rhs),
           buffer_(buffer),
-          output_(buffer, m),
+          output_(buffer, tm),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
-          m_(m),
-          n_(n),
-          k_(k),
+          m_(tm),
+          n_(tn),
+          k_(tk),
           bm_(bm),
           bn_(bn),
           bk_(bk),

View File

@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
@@ -182,7 +182,7 @@ struct ThreadPoolDevice {
                    std::function<void(Index, Index)> f) const {
     typedef TensorCostModel<ThreadPoolDevice> CostModel;
     if (n <= 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, numThreads()) == 1) {
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
       return;
     }
@@ -242,7 +242,7 @@ struct ThreadPoolDevice {
     // Recursively divide size into halves until we reach block_size.
     // Division code rounds mid to block_size, so we are guaranteed to get
     // block_count leaves that do actual computations.
-    Barrier barrier(block_count);
+    Barrier barrier(static_cast<unsigned int>(block_count));
     std::function<void(Index, Index)> handleRange;
     handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
       if (last - first <= block_size) {
@@ -268,7 +268,7 @@ struct ThreadPoolDevice {
 
  private:
   ThreadPoolInterface* pool_;
-  size_t num_threads_;
+  int num_threads_;
 };
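
Editor's note: a hedged sketch of how a ThreadPoolDevice is typically constructed and used with the Tensor module (assumes the unsupported/CXX11 ThreadPool API; note num_cores is now an int):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main()
{
  Eigen::ThreadPool pool(4);                 // worker threads, owned by the caller
  Eigen::ThreadPoolDevice device(&pool, 4);  // num_cores parameter is an int after this change

  Eigen::Tensor<float, 1> a(10000), b(10000), c(10000);
  a.setRandom();
  b.setRandom();
  c.device(device) = a + b;                  // expression evaluated on the thread pool
  return 0;
}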

View File

@@ -426,6 +426,20 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
       m_arg3Impl(op.arg3Expression(), device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg2Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg3Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+
     eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
   }

View File

@@ -227,22 +227,6 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
                                                       TernaryOp(typename Arg1XprType::Scalar,
                                                                 typename Arg2XprType::Scalar,
                                                                 typename Arg3XprType::Scalar)>::type Scalar;
-  EIGEN_STATIC_ASSERT(
-      (internal::is_same<typename traits<Arg1XprType>::StorageKind,
-                         typename traits<Arg2XprType>::StorageKind>::value),
-      STORAGE_KIND_MUST_MATCH)
-  EIGEN_STATIC_ASSERT(
-      (internal::is_same<typename traits<Arg1XprType>::StorageKind,
-                         typename traits<Arg3XprType>::StorageKind>::value),
-      STORAGE_KIND_MUST_MATCH)
-  EIGEN_STATIC_ASSERT(
-      (internal::is_same<typename traits<Arg1XprType>::Index,
-                         typename traits<Arg2XprType>::Index>::value),
-      STORAGE_INDEX_MUST_MATCH)
-  EIGEN_STATIC_ASSERT(
-      (internal::is_same<typename traits<Arg1XprType>::Index,
-                         typename traits<Arg3XprType>::Index>::value),
-      STORAGE_INDEX_MUST_MATCH)
   typedef traits<Arg1XprType> XprTraits;
   typedef typename traits<Arg1XprType>::StorageKind StorageKind;
   typedef typename traits<Arg1XprType>::Index Index;

View File

@@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
 };
 
+template<typename Reducer, typename Device>
+struct reducer_traits {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
@@ -119,6 +127,15 @@ template <typename T> struct SumReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
 template <typename T> struct MeanReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
     DenseIndex packetCount_;
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMax
+  };
+};
+
 template <typename T> struct MinReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMin;
@@ -228,6 +263,14 @@ template <typename T> struct MinReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMin
+  };
+};
+
 template <typename T> struct ProdReducer
 {
@@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::MulCost,
+    PacketAccess = PacketType<T, Device>::HasMul
+  };
+};
+
 struct AndReducer
 {
@@ -280,6 +331,15 @@ struct AndReducer
   }
 };
 
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
 struct OrReducer {
   static const bool PacketAccess = false;
   static const bool IsStateful = false;
@@ -295,6 +355,15 @@ struct OrReducer {
   }
 };
 
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
 // Argmin/Argmax reducers
 template <typename T> struct ArgMaxTupleReducer
 {
@@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
 template <typename T> struct ArgMinTupleReducer
 {
   static const bool PacketAccess = false;
@@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
 // Random number generation
 namespace {
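
Editor's note: a hedged sketch of how the new trait is meant to be queried (internal API; DefaultDevice stands in for the host device):

#include <unsupported/Eigen/CXX11/Tensor>

// Compile-time query of the reduction properties introduced above: the cost model can
// ask a reducer for its per-coefficient cost and packet (SIMD) support on a given
// device without instantiating the reducer itself.
typedef Eigen::internal::SumReducer<float> SumF;
typedef Eigen::internal::reducer_traits<SumF, Eigen::DefaultDevice> SumTraits;

static const int  kSumCost         = SumTraits::Cost;          // NumTraits<float>::AddCost
static const bool kSumPacketAccess = SumTraits::PacketAccess;  // PacketType<float, DefaultDevice>::HasAdd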

View File

@@ -47,22 +47,39 @@ template <> struct max_n_1<0> {
 // Default packet types
 template <typename Scalar, typename Device>
-struct PacketType {
+struct PacketType : internal::packet_traits<Scalar> {
   typedef typename internal::packet_traits<Scalar>::type type;
-  enum { size = internal::unpacket_traits<type>::size };
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
 template <>
-struct PacketType<float, GpuDevice> {
-  typedef float4 type;
-  static const int size = 4;
-};
-template <>
-struct PacketType<double, GpuDevice> {
-  typedef double2 type;
+struct PacketType<half, GpuDevice> {
+  typedef half2 type;
   static const int size = 2;
+  enum {
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasArg    = 0,
+    HasAbs2   = 0,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasBlend  = 0,
+
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasLog    = 1,
+    HasLog1p  = 0,
+    HasLog10  = 0,
+    HasPow    = 1,
+  };
 };
 #endif

View File

@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
 
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
  protected:
   TensorEvaluator<ArgType, Device> m_impl;

View File

@@ -130,15 +130,17 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     if (block == 0) {
       // We're the first block to run, initialize the output value
       atomicExch(output, reducer.initialize());
-      unsigned int old = atomicExch(semaphore, 2u);
-      assert(old == 1u);
+      __threadfence();
+      atomicExch(semaphore, 2u);
     }
     else {
+      // Wait for the first block to initialize the output value.
       // Use atomicCAS here to ensure that the reads aren't cached
-      unsigned int val = atomicCAS(semaphore, 2u, 2u);
-      while (val < 2u) {
+      unsigned int val;
+      do {
         val = atomicCAS(semaphore, 2u, 2u);
       }
+      while (val < 2u);
     }
   }
 }
@@ -166,12 +168,8 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   }
 
   if (gridDim.x > 1 && threadIdx.x == 0) {
-    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
-    assert(ticket >= 2u);
-    if (ticket == gridDim.x + 1) {
-      // We're the last block, reset the semaphore
-      *semaphore = 0;
-    }
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
   }
 }
@@ -330,10 +328,10 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
   // so reduce the scope of the optimized version of the code to the simple case
   // of floats and half floats.
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -348,7 +346,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
       return;
     }
 
-    FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
   }
 };
@@ -610,7 +608,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -629,7 +627,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
       return true;
     }
 
-    return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
   }
 };

View File

@@ -81,7 +81,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
-  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
       m_output(NULL) {
     // Accumulating a scalar isn't supported.
-    EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     eigen_assert(m_axis >= 0 && m_axis < NumDims);
 
     // Compute stride of scan axis
@@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_dimensions;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
@@ -136,7 +136,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
       return true;
     }
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
     return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
@@ -152,6 +152,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return m_output[index];
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     if (m_output != NULL) {
       m_device.deallocate(m_output);