From 8290e21fb5c96f8d9f18b419eb0c17ae99c05f23 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 17 Nov 2016 13:21:15 -0500 Subject: [PATCH 001/680] replace sizeof(Packet) with PacketSize else it breaks for ZVector.Packet4f --- Eigen/src/Core/arch/ZVector/Complex.h | 230 +++++++++++- Eigen/src/Core/arch/ZVector/MathFunctions.h | 69 ++-- Eigen/src/Core/arch/ZVector/PacketMath.h | 386 +++++++++++++++++++- test/packetmath.cpp | 2 +- 4 files changed, 646 insertions(+), 41 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index e9d83eca6..d39d2d105 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud +// Copyright (C) 2016 Konstantinos Margaritis // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -24,13 +25,48 @@ struct Packet1cd Packet2d v; }; +struct Packet2cf +{ + EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} + union { + Packet4f v; + Packet1cd cd[2]; + }; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasBlend = 1, + HasSetLinear = 0 + }; +}; + + template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; enum { Vectorizable = 1, - AlignedOnScalar = 0, + AlignedOnScalar = 1, size = 1, HasHalfPacket = 0, @@ -47,20 +83,68 @@ template<> struct packet_traits > : default_packet_traits }; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +/* Forward declaration */ +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); + +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return 
ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) +{ + return pload(from); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride EIGEN_UNUSED) +{ + pstore >(to, from); +} + +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) +{ + Packet2cf res; + res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; + return res; +} template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { @@ -79,43 +163,90 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(v1 + v2); } - -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } - -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { - return pset1(*from); + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; + return res; } +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE 
Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res[2]; - pstore >(res, a); + std::complex EIGEN_ALIGN16 res; + pstore >(&res, a); + + return res; +} +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); return res[0]; } template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + ptranspose(transpose); + + return padd(transpose.packet[0], transpose.packet[1]); +} template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} template struct palign_impl @@ -127,6 +258,18 @@ struct palign_impl } }; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset == 1) { + first.cd[0] = first.cd[1]; + first.cd[1] = second.cd[0]; + } + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -160,6 +303,39 @@ template<> struct conj_helper } }; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const 
Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec @@ -168,17 +344,49 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); } +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for AltiVec + Packet2cf res; + res.cd[0] = pdiv(a.cd[0], b.cd[0]); + res.cd[1] = pdiv(a.cd[1], b.cd[1]); + return res; +} + EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) { return Packet1cd(preverse(Packet2d(x.v))); } +EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) +{ + Packet2cf res; + res.cd[0] = pcplxflip(x.cd[0]); + res.cd[1] = pcplxflip(x.cd[1]); + return res; +} + EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); kernel.packet[0].v = tmp; } + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet1cd tmp = kernel.packet[0].cd[1]; + kernel.packet[0].cd[1] = kernel.packet[1].cd[0]; + kernel.packet[1].cd[0] = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] }; + result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); + return result; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h index 6fff8524e..5c7aa7256 100644 --- a/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -3,6 +3,7 @@ // // Copyright (C) 2007 Julien Pommier // Copyright (C) 2009 Gael Guennebaud +// Copyright (C) 2016 Konstantinos Margaritis // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -19,32 +20,32 @@ namespace Eigen { namespace internal { +static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + +static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) { Packet2d x = _x; - _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); - _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); - _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - Packet2d tmp, fx; Packet2l emm0; @@ -91,18 +92,44 @@ Packet2d pexp(const Packet2d& _x) isnumber_mask); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& x) +{ + Packet4f res; + res.v4f[0] = pexp(x.v4f[0]); + res.v4f[1] = pexp(x.v4f[1]); + return res; +} + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { return __builtin_s390_vfsqdb(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt(const Packet4f& x) +{ + Packet4f res; + res.v4f[0] = psqrt(x.v4f[0]); + res.v4f[1] = psqrt(x.v4f[1]); + return res; +} + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. 
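// (Editorial note: the expression below computes an exact per-lane
// 1.0 / sqrt(x) through the full-precision vector square root, e.g.
// x = {4.0, 0.25} yields {0.5, 2.0}. A hardware reciprocal-square-root
// estimate, where one is available, is typically accurate to only about
// 12 bits and would need a Newton-Raphson refinement step first.)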
return pset1(1.0) / psqrt(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt(const Packet4f& x) { + Packet4f res; + res.v4f[0] = prsqrt(x.v4f[0]); + res.v4f[1] = prsqrt(x.v4f[1]); + return res; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 5a7226be6..e2deb25c8 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -28,9 +28,8 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif -// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif typedef __vector int Packet4i; @@ -42,6 +41,10 @@ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; +typedef struct { + Packet2d v4f[2]; +} Packet4f; + typedef union { int32_t i[4]; uint32_t ui[4]; @@ -88,6 +91,7 @@ static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; @@ -132,13 +136,11 @@ template<> struct packet_traits : default_packet_traits typedef Packet4i type; typedef Packet4i half; enum { - // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, size = 4, HasHalfPacket = 0, - // FIXME check the Has* HasAdd = 1, HasSub = 1, HasMul = 1, @@ -147,6 +149,37 @@ template<> struct packet_traits : default_packet_traits }; }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; + template<> struct packet_traits : default_packet_traits { typedef Packet2d type; @@ -157,7 +190,6 @@ template<> struct packet_traits : default_packet_traits size=2, HasHalfPacket = 1, - // FIXME check the Has* HasAdd = 1, HasSub = 1, HasMul = 1, @@ -180,8 +212,12 @@ template<> struct packet_traits : default_packet_traits }; template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +/* Forward declaration */ +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); + inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) { Packet vt; @@ -222,6 +258,32 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } +/* Helper function to simulate a vec_splat_packet4f + */ +template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + 
splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} + template struct palign_impl { @@ -238,6 +300,31 @@ struct palign_impl } }; +/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double + */ +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); + first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); + break; + case 2: + first.v4f[0] = first.v4f[1]; + first.v4f[1] = second.v4f[0]; + break; + case 3: + first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); + first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); + break; + } + } +}; + + template struct palign_impl { @@ -257,6 +344,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) return vfrom->v4i; } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} + template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { // FIXME: No intrinsic yet @@ -275,6 +372,15 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f vto->v4i = from; } +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + vec_st2f(from.v4f[0], &to[0]); + vec_st2f(from.v4f[1], &to[2]); +} + + template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { // FIXME: No intrinsic yet @@ -288,10 +394,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vec_splats(from); } - template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vec_splats(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + Packet4f to; + to.v4f[0] = pset1(static_cast(from)); + to.v4f[1] = to.v4f[0]; + return to; +} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, @@ -304,6 +416,17 @@ pbroadcast4(const int *a, a3 = vec_splat(a3, 3); } +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat_packet4f<0>(a3); + a1 = vec_splat_packet4f<1>(a3); + a2 = vec_splat_packet4f<2>(a3); + a3 = vec_splat_packet4f<3>(a3); +} + template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) @@ -326,6 +449,16 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f return pload(ai); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -344,6 +477,16 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; 
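+ // (Editorial note: the packet is first spilled to the aligned scratch
+ // array ai and then written out one scalar at a time, so a call with
+ // stride == 3 places the four lanes at to[0], to[3], to[6] and to[9];
+ // pgather above performs the matching strided read.)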
+ pstore((float *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -353,52 +496,160 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] + b.v4f[0]; + c.v4f[1] = a.v4f[1] + b.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] - b.v4f[0]; + c.v4f[1] = a.v4f[1] - b.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] * b.v4f[0]; + c.v4f[1] = a.v4f[1] * b.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] / b.v4f[0]; + c.v4f[1] = a.v4f[1] / b.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + Packet4f c; + c.v4f[0] = -a.v4f[0]; + c.v4f[1] = -a.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ + Packet4f res; + res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); + res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f 
pmin<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = por(a.v4f[0], b.v4f[0]); + res.v4f[1] = por(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pxor(a.v4f[0], b.v4f[0]); + res.v4f[1] = pxor(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); + res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); + return res; +} +template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_round(a.v4f[0]); + res.v4f[1] = vec_round(a.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_ceil(a.v4f[0]); + res.v4f[1] = vec_ceil(a.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_floor(a.v4f[0]); + res.v4f[1] = vec_floor(a.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); } @@ -408,6 +659,14 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) return vec_perm(p, p, p16uc_DUPLICATE32_HI); } +template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*
from) +{ + Packet4f p = pload(from); + p.v4f[1] = vec_splat(p.v4f[0], 1); + p.v4f[0] = vec_splat(p.v4f[0], 0); + return p; +} + template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p = pload(from); @@ -415,12 +674,15 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) @@ -433,8 +695,23 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); } -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + Packet4f rev; + rev.v4f[0] = preverse(a.v4f[1]); + rev.v4f[1] = preverse(a.v4f[0]); + return rev; +} + +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = pabs(a.v4f[0]); + res.v4f[1] = pabs(a.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -453,6 +730,13 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) sum = padd(a, b); return pfirst(sum); } +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet2d sum; + sum = padd(a.v4f[0], a.v4f[1]); + double first = predux(sum); + return static_cast(first); +} template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { @@ -493,6 +777,21 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) return sum; } +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + transpose.packet[2] = vecs[2]; + transpose.packet[3] = vecs[3]; + ptranspose(transpose); + + Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); + sum = padd(sum, transpose.packet[2]); + sum = padd(sum, transpose.packet[3]); + return sum; +} + // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) @@ -507,6 +806,12 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + 
// Return predux_mul of the subvectors product + return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); +} + // min template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { @@ -521,6 +826,14 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet2d b, res; + b = pmin(a.v4f[0], a.v4f[1]); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + // max template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { @@ -536,6 +849,14 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet2d b, res; + b = pmax(a.v4f[0], a.v4f[1]); + res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); @@ -556,12 +877,61 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = t1; } +/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one + */ +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + PacketBlock t0,t1,t2,t3; + // copy top-left 2x2 Packet2d block + t0.packet[0] = kernel.packet[0].v4f[0]; + t0.packet[1] = kernel.packet[1].v4f[0]; + + // copy top-right 2x2 Packet2d block + t1.packet[0] = kernel.packet[0].v4f[1]; + t1.packet[1] = kernel.packet[1].v4f[1]; + + // copy bottom-left 2x2 Packet2d block + t2.packet[0] = kernel.packet[2].v4f[0]; + t2.packet[1] = kernel.packet[3].v4f[0]; + + // copy bottom-right 2x2 Packet2d block + t3.packet[0] = kernel.packet[2].v4f[1]; + t3.packet[1] = kernel.packet[3].v4f[1]; + + // Transpose all 2x2 blocks + ptranspose(t0); + ptranspose(t1); + ptranspose(t2); + ptranspose(t3); + + // Copy back transposed blocks, but exchange t1 and t2 due to transposition + kernel.packet[0].v4f[0] = t0.packet[0]; + kernel.packet[0].v4f[1] = t2.packet[0]; + kernel.packet[1].v4f[0] = t0.packet[1]; + kernel.packet[1].v4f[1] = t2.packet[1]; + kernel.packet[2].v4f[0] = t1.packet[0]; + kernel.packet[2].v4f[1] = t3.packet[0]; + kernel.packet[3].v4f[0] = t1.packet[1]; + kernel.packet[3].v4f[1] = t3.packet[1]; +} + template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; + Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast(p2l_ONE)); + Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast(p2l_ONE)); + Packet4f result; + result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi); + result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo); + return result; +} + template<> EIGEN_STRONG_INLINE Packet2d 
pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index c18d73496..7821a1738 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -591,7 +591,7 @@ template void packetmath_scatter_gather() int stride = internal::random(1,20); EIGEN_ALIGN_MAX Scalar buffer[PacketSize*20]; - memset(buffer, 0, 20*sizeof(Packet)); + memset(buffer, 0, 20*PacketSize*sizeof(Scalar)); Packet packet = internal::pload(data1); internal::pscatter(buffer, packet, stride); From bbd97b4095ff9cbe9898d68b3ab7bdff8125f3fb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 01:02:51 +0200 Subject: [PATCH 002/680] Add a EIGEN_NO_CUDA option, and introduce EIGEN_CUDACC and EIGEN_CUDA_ARCH aliases --- Eigen/Core | 18 +++++-- Eigen/src/Core/GeneralProduct.h | 4 +- Eigen/src/Core/GenericPacketMath.h | 4 +- Eigen/src/Core/MathFunctions.h | 48 +++++++++---------- Eigen/src/Core/MatrixBase.h | 2 +- Eigen/src/Core/ProductEvaluators.h | 4 +- Eigen/src/Core/arch/CUDA/Complex.h | 2 +- Eigen/src/Core/arch/CUDA/Half.h | 30 ++++++------ Eigen/src/Core/arch/CUDA/MathFunctions.h | 2 +- Eigen/src/Core/arch/CUDA/PacketMath.h | 10 ++-- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 28 +++++------ Eigen/src/Core/arch/CUDA/TypeCasting.h | 8 ++-- Eigen/src/Core/functors/AssignmentFunctors.h | 2 +- Eigen/src/Core/util/Macros.h | 6 +-- Eigen/src/Core/util/Meta.h | 8 ++-- Eigen/src/SVD/BDCSVD.h | 2 +- Eigen/src/SparseQR/SparseQR.h | 2 +- .../CXX11/src/Tensor/TensorContractionCuda.h | 4 +- .../CXX11/src/Tensor/TensorConvolution.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 12 ++--- .../CXX11/src/Tensor/TensorDeviceDefault.h | 10 ++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12 ++--- .../Eigen/CXX11/src/Tensor/TensorMacros.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 6 +-- .../CXX11/src/Tensor/TensorReductionCuda.h | 10 ++-- .../Eigen/CXX11/src/Tensor/TensorScan.h | 4 +- .../Eigen/CXX11/src/util/EmulateArray.h | 2 +- .../SpecialFunctions/SpecialFunctionsImpl.h | 4 +- .../arch/CUDA/CudaSpecialFunctions.h | 2 +- 33 files changed, 134 insertions(+), 126 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index a16942e19..d2edbd2bc 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -14,8 +14,16 @@ // first thing Eigen does: stop the compiler from committing suicide #include "src/Core/util/DisableStupidWarnings.h" +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) + #define EIGEN_CUDACC __CUDACC__ +#endif + +#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) + #define EIGEN_CUDA_ARCH __CUDA_ARCH__ +#endif + // Handle NVCC/CUDA/SYCL -#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) +#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) // Do not try asserts on CUDA and SYCL! #ifndef EIGEN_NO_DEBUG #define EIGEN_NO_DEBUG @@ -30,7 +38,7 @@ #endif // All functions callable from CUDA code must be qualified with __device__ - #ifdef __CUDACC__ + #ifdef EIGEN_CUDACC // Do not try to vectorize on CUDA and SYCL! #ifndef EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE @@ -50,13 +58,13 @@ // When compiling CUDA device code with NVCC, pull in math functions from the // global namespace. 
In host mode, and when device code is compiled with clang, use the // std versions. -#if defined(__CUDA_ARCH__) && defined(__NVCC__) +#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__) #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; #else #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; #endif -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) #define EIGEN_EXCEPTIONS #endif @@ -233,7 +241,7 @@ #define EIGEN_HAS_FP16_C #endif -#if defined __CUDACC__ +#if defined EIGEN_CUDACC #define EIGEN_VECTORIZE_CUDA #include <vector_types.h> #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index b206b0a7a..95dadfea8 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -379,7 +379,7 @@ template<> struct gemv_dense_selector * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC template<typename Derived> template<typename OtherDerived> @@ -412,7 +412,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const return Product<Derived, OtherDerived>(derived(), other.derived()); } -#endif // __CUDACC__ +#endif // EIGEN_CUDACC /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. * diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index d19d5bbd2..30878eda6 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -299,7 +299,7 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu /** \internal tries to do cache prefetching of \a addr */ template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH #if defined(__LP64__) // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); @@ -526,7 +526,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes.
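// (Editorial gloss: as the comment in CUDA/Complex.h later in this series
// explains, std::complex operators such as operator* are not constexpr, so
// clang cannot treat them as device functions; the specializations guarded
// below are therefore host-only. A concrete check of the formula that
// follows: pmul(std::complex<float>(1,2), std::complex<float>(3,4)) gives
// (1*3 - 2*4, 2*3 + 1*4) = (-5, 10).)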
-#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 75f34aa91..a906b7629 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH template struct real_impl > { @@ -144,7 +144,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH template struct imag_impl > { @@ -778,7 +778,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #ifdef EIGEN_CUDA_ARCH return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -793,7 +793,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #ifdef EIGEN_CUDA_ARCH return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -808,7 +808,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #ifdef EIGEN_CUDA_ARCH return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -874,7 +874,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -1088,7 +1088,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1146,7 +1146,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1167,7 +1167,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1225,7 +1225,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1253,7 +1253,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1283,7 +1283,7 
@@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } @@ -1303,7 +1303,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1323,7 +1323,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1343,7 +1343,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1363,7 +1363,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1393,7 +1393,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1422,7 +1422,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1451,7 +1451,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); } EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1472,7 +1472,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1492,7 +1492,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1510,12 +1510,12 @@ T tanh(const T &x) { #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } 
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH +#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1535,7 +1535,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y) EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 200e57741..908d35dfe 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -160,7 +160,7 @@ template class MatrixBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC template EIGEN_DEVICE_FUNC const Product diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index c42725dbd..86966abdb 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -851,7 +851,7 @@ struct product_evaluator, ProductTag, DiagonalSha return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -895,7 +895,7 @@ struct product_evaluator, ProductTag, DenseShape, return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index ca0aaed32..57d1201f4 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -16,7 +16,7 @@ namespace Eigen { namespace internal { -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. Due to this, clang does not treat them as device diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index e4e639fcd..e99847e24 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -140,7 +140,7 @@ struct half : public half_impl::half_base { namespace half_impl { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 // Intrinsics for native fp16 support. 
Note that on current hardware, // these are no faster than fp32 arithmetic (you need to use the half2 @@ -281,7 +281,7 @@ union FP32 { }; EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __float2half(ff); #elif defined(EIGEN_HAS_FP16_C) @@ -336,7 +336,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) @@ -370,7 +370,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -386,7 +386,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hexp(a)); #else return half(::expf(float(a))); @@ -396,7 +396,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return half(::hlog(a)); #else return half(::logf(float(a))); @@ -409,7 +409,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -431,14 +431,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -446,7 +446,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC 
half ceil(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(b, a) ? b : a; #else const float f1 = static_cast(a); @@ -455,7 +455,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(a, b) ? b : a; #else const float f1 = static_cast(a); @@ -576,7 +576,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); @@ -610,14 +610,14 @@ struct hash { // Add the missing shfl_xor intrinsic -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { return static_cast(__shfl_xor(static_cast(var), laneMask, width)); } #endif // ldg() has an overload for __half, but we also need one for Eigen::half. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast(ptr))); @@ -625,7 +625,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) namespace Eigen { namespace numext { diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 987a5291c..ff6256ce0 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog(const float4& a) { diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 8c46af09b..97a8abe59 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) 
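Throughout this series the compiler-provided __CUDACC__ and __CUDA_ARCH__ macros are systematically replaced by EIGEN_CUDACC and EIGEN_CUDA_ARCH, so that CUDA support can be switched off globally via EIGEN_NO_CUDA. A minimal sketch of the mapping these hunks rely on (an assumption for illustration; the actual definitions live in Eigen/src/Core/util/Macros.h and are not part of the hunks shown here):
\code
// presumed wiring, honoring EIGEN_NO_CUDA
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
  #define EIGEN_CUDACC __CUDACC__
#endif
#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
\endcode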
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; @@ -196,7 +196,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return __ldg((const float4*)from); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -204,7 +204,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const fl } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return __ldg((const double2*)from); #else return make_double2(from[0], from[1]); @@ -213,7 +213,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -221,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return make_double2(__ldg(from+0), __ldg(from+1)); #else return make_double2(from[0], from[1]); diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index f4ae3c3c5..4a730a0e0 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { // Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 template<> struct is_arithmetic { enum { value = true }; }; @@ -69,7 +69,7 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 +#if EIGEN_CUDA_ARCH >= 350 return __ldg((const half2*)from); #else return __halves2half2(*(from+0), *(from+1)); @@ -78,7 +78,7 @@ template<> template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 +#if EIGEN_CUDA_ARCH >= 350 return __halves2half2(__ldg(from+0), __ldg(from+1)); #else return __halves2half2(*(from+0), *(from+1)); @@ -116,7 +116,7 @@ ptranspose(PacketBlock& kernel) { } template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; @@ -125,7 +125,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& } template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hadd2(a, b); #else float a1 = __low2float(a); @@ 
-139,7 +139,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hsub2(a, b); #else float a1 = __low2float(a); @@ -153,7 +153,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hneg2(a); #else float a1 = __low2float(a); @@ -165,7 +165,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hmul2(a, b); #else float a1 = __low2float(a); @@ -179,7 +179,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hfma2(a, b, c); #else float a1 = __low2float(a); @@ -225,7 +225,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hadd(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); @@ -235,7 +235,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); return __hgt(first, second) ? first : second; @@ -247,7 +247,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const ha } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); return __hlt(first, second) ? 
first : second; @@ -259,7 +259,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const ha } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if __CUDA_ARCH__ >= 530 +#if EIGEN_CUDA_ARCH >= 530 return __hmul(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); @@ -284,7 +284,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { return __floats2half2_rn(r1, r2); } -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index aa5fbce8e..30f870c3d 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -19,7 +19,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __float2half(a); #else return Eigen::half(a); @@ -37,7 +37,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __float2half(static_cast(a)); #else return Eigen::half(static_cast(a)); @@ -55,7 +55,7 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(a); #else return static_cast(a); @@ -69,7 +69,7 @@ struct functor_traits > -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 template <> struct type_casting_traits { diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 4153b877c..1077d8eb0 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast(b)=a; a=t; #else diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 755646795..322e8d899 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -427,7 +427,7 @@ // Does the compiler fully support const expressions? 
(as in c++14) #ifndef EIGEN_HAS_CONSTEXPR -#if defined(__CUDACC__) +#if defined(EIGEN_CUDACC) // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) #define EIGEN_HAS_CONSTEXPR 1 @@ -669,7 +669,7 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if (defined __CUDACC__) +#if (defined EIGEN_CUDACC) #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) #elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) @@ -990,7 +990,7 @@ namespace Eigen { # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# ifdef __CUDA_ARCH__ +# ifdef EIGEN_CUDA_ARCH # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") # else diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 8de605500..0fa818008 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -11,7 +11,7 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) #include #include #endif @@ -169,7 +169,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -523,13 +523,13 @@ template struct scalar_product_traits namespace numext { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) using internal::device::numeric_limits; #else using std::numeric_limits; diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index d7a4271cb..b8c41c560 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1211,7 +1211,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #endif }//end deflation -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 2d4498b03..f7111fe2e 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -327,7 +327,7 @@ void SparseQR::analyzePattern(const MatrixType& mat) internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data()); m_isEtreeOk = true; - m_R.resize(m, n); + m_R.resize(diagSize, n); m_Q.resize(m, diagSize); // Allocate space for nonzero elements : rough estimation diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index c04b784a4..428b18499 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -12,7 +12,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) namespace Eigen { @@ -1382,5 +1382,5 @@ struct TensorEvaluator struct GetKernelSize { diff 
--git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index be8d69386..ded7129da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -211,7 +211,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) @@ -239,7 +239,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); @@ -265,7 +265,7 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH) cudaError_t err = cudaStreamSynchronize(stream_->stream()); if (err != cudaSuccess) { std::cerr << "Error detected in CUDA stream: " @@ -304,7 +304,7 @@ struct GpuDevice { // This function checks if the CUDA runtime recorded an error for the // underlying stream device. inline bool ok() const { -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC cudaError_t error = cudaStreamQuery(stream_->stream()); return (error == cudaSuccess) || (error == cudaErrorNotReady); #else @@ -323,9 +323,9 @@ struct GpuDevice { // FIXME: Should be device and kernel specific. -#ifdef __CUDACC__ +#ifdef EIGEN_CUDACC static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH cudaError_t status = cudaDeviceSetSharedMemConfig(config); EIGEN_UNUSED_VARIABLE(status) assert(status == cudaSuccess); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index ccaaa6cb2..341889e88 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -35,7 +35,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH // Running on the host CPU return 1; #else @@ -45,7 +45,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) // Running on the host CPU return l1CacheSize(); #else @@ -55,7 +55,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) // Running single threaded on the host CPU return l3CacheSize(); #else @@ -65,13 +65,13 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ +#ifndef EIGEN_CUDA_ARCH // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; #else // Running on a CUDA device - return __CUDA_ARCH__ / 100; + return EIGEN_CUDA_ARCH / 100; #endif } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index fcf330b10..2264be391 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -131,7 +131,7 @@ T loadConstant(const T* address) { return *address; } // Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { return __ldg(address); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f01d77c0a..0ffe68ab3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -201,7 +201,7 @@ class TensorExecutor { }; -#if defined(__CUDACC__) +#if defined(EIGEN_CUDACC) template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE @@ -264,7 +264,7 @@ inline void TensorExecutor::run( evaluator.cleanup(); } -#endif // __CUDACC__ +#endif // EIGEN_CUDACC #endif // EIGEN_USE_GPU // SYCL Executor policy diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index ef1c9c42c..fb6454623 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -35,7 +35,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return __clz(val); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::clz(val); @@ -53,7 +53,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return __clzll(val); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::clz(val); @@ -90,7 +90,7 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) return __umulhi(a, b); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::mul_hi(a, static_cast(b)); @@ -101,7 +101,7 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) return __umul64hi(a, b); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::mul_hi(a, static_cast(b)); @@ -124,7 +124,7 @@ namespace { template struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -203,7 +203,7 @@ class TensorIntDivisor { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH return (__umulhi(magic, n) >> shift); #elif defined(__SYCL_DEVICE_ONLY__) return (cl::sycl::mul_hi(static_cast(magic), static_cast(n)) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index 
f92e39d69..c9e61f359 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#ifndef __CUDACC__ +#ifndef EIGEN_CUDACC #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 77c9c6c6e..5431eb740 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16) template <> struct PacketType { typedef half2 type; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index f108c349f..230915db2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -16,7 +16,7 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef __CUDA_ARCH__ +#ifdef EIGEN_CUDA_ARCH // We don't support 3d kernels since we currently only use 1 and // 2d kernels. assert(threadIdx.z == 0); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 69079805d..da0ffe728 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -334,7 +334,7 @@ struct OuterReducer { }; -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) template __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); @@ -694,7 +694,7 @@ struct TensorEvaluator, #ifdef EIGEN_USE_THREADS template friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); @@ -781,7 +781,7 @@ struct TensorEvaluator, Op m_reducer; // For full reductions -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) static const bool RunningOnGPU = internal::is_same::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 24a55a3d5..974eb7deb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -14,7 +14,7 @@ namespace Eigen { namespace internal { -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) // Full reducers for GPU, don't vectorize for now // Reducer function that enables multiple cuda thread to safely accumulate at the same @@ -23,7 +23,7 @@ namespace internal { // updated the content of the output address it will try again. 
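The comment above describes a lock-free retry loop: each thread re-reads the current output, folds in its own partial result, and publishes it with a compare-and-swap, retrying whenever another thread raced it. Schematically, for the 32-bit case handled first below (a simplified sketch; to_bits/from_bits are hypothetical stand-ins for the reinterpreting casts used in the real kernel):
\code
unsigned int* addr = reinterpret_cast<unsigned int*>(output);
unsigned int oldval = *addr, assumed;
do {
  assumed = oldval;
  T newval = from_bits(assumed);                       // hypothetical helper
  reducer.reduce(accum, &newval);                      // fold this thread's contribution
  oldval = atomicCAS(addr, assumed, to_bits(newval));  // publish, or learn the new value
} while (assumed != oldval);                           // retry if *addr changed under us
\endcode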
template __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 if (sizeof(T) == 4) { unsigned int oldval = *reinterpret_cast(output); @@ -102,7 +102,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer template <> __device__ inline void atomicReduce(float* output, float accum, SumReducer&) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 atomicAdd(output, accum); #else // __CUDA_ARCH__ >= 300 assert(0 && "Shouldn't be called on unsupported device"); @@ -124,7 +124,7 @@ template __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, typename Self::CoeffReturnType* output, unsigned int* semaphore) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 // Initialize the output value const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; if (gridDim.x == 1) { @@ -372,7 +372,7 @@ template __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { -#if __CUDA_ARCH__ >= 300 +#if EIGEN_CUDA_ARCH >= 300 typedef typename Self::CoeffReturnType Type; eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 2a85ed840..1f545ef1a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -242,7 +242,7 @@ struct ScanLauncher { } }; -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) // GPU implementation of scan // TODO(ibab) This placeholder implementation performs multiple scans in @@ -281,7 +281,7 @@ struct ScanLauncher { LAUNCH_CUDA_KERNEL((ScanKernel), num_blocks, block_size, 0, self.device(), self, total_size, data); } }; -#endif // EIGEN_USE_GPU && __CUDACC__ +#endif // EIGEN_USE_GPU && EIGEN_CUDACC } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 573ca435a..96b3a8261 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -15,7 +15,7 @@ // The array class is only available starting with cxx11. Emulate our own here // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! // Moreover, CUDA doesn't support the STL containers, so we use our own instead. 
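Thanks to this emulation layer, downstream code can use Eigen::array uniformly; a minimal usage sketch (assuming the emulated class mirrors the std::array interface, as the comment above implies):
\code
// Eigen::array ships with the unsupported CXX11 Tensor headers
Eigen::array<int, 3> dims;  // std::array on c++11 hosts, Eigen's own class under nvcc
dims[0] = 2; dims[1] = 3; dims[2] = 4;
const int total = dims[0] * dims[1] * dims[2];
\endcode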
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) +#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY) namespace Eigen { template class array { diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 369ad97b4..5d1b8fcc2 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -121,7 +121,7 @@ template <> struct lgamma_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) { -#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) +#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) int dummy; return ::lgammaf_r(x, &dummy); #else @@ -134,7 +134,7 @@ template <> struct lgamma_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) { -#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) +#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) int dummy; return ::lgamma_r(x, &dummy); #else diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h index ec4fa8448..e0e3a8be6 100644 --- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h +++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plgamma(const float4& a) From 9f8136ff747088b9a74c89ab0b3d89ac1081e83d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 10:43:18 +0200 Subject: [PATCH 003/680] disable nvcc boolean-expr-is-constant warning --- Eigen/src/Core/util/DisableStupidWarnings.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index b91d1d1af..8ef0f3594 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -55,6 +55,7 @@ #endif #if defined __NVCC__ + #pragma diag_suppress boolean_controlling_expr_is_constant // Disable the "statement is unreachable" message #pragma diag_suppress code_is_unreachable // Disable the "dynamic initialization in unreachable code" message From 3182bdbae68b11aa8278107ca71c5df2de789e66 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 11:01:28 +0200 Subject: [PATCH 004/680] Disable vectorization when compiled by nvcc, even if EIGEN_NO_CUDA is defined --- Eigen/Core | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index d2edbd2bc..f613ee9aa 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -55,6 +55,10 @@ #define EIGEN_DEVICE_FUNC #endif +#ifdef __NVCC__ +#define EIGEN_DONT_VECTORIZE +#endif + // When compiling CUDA device code with NVCC, pull in math functions from the // global namespace.
In host mode, and when device code is compiled with clang, use the // std versions. From a74b9ba7cda08b8fbcb187aa5e96f0e99cf9b684 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 11:05:26 +0200 Subject: [PATCH 005/680] Update documentation for CUDA --- doc/PreprocessorDirectives.dox | 2 ++ doc/UsingNVCC.dox | 12 +++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index f01b39aec..0919d4190 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox @@ -120,6 +120,8 @@ run time. However, these assertions do cost time and can thus be turned off. - \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB. + - \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host only, + and never called from device code. - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default. diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox index f8e755b79..9bcdf0bfc 100644 --- a/doc/UsingNVCC.dox +++ b/doc/UsingNVCC.dox @@ -3,18 +3,16 @@ namespace Eigen { /** \page TopicCUDA Using Eigen in CUDA kernels -\b Disclaimer: this page is about an \b experimental feature in %Eigen. - -Staring from CUDA 5.0, the CUDA compiler, \c nvcc, is able to properly parse %Eigen's code (almost). -A few adaptations of the %Eigen's code already allows to use some parts of %Eigen in your own CUDA kernels. -To this end you need the devel branch of %Eigen, CUDA 5.0 or greater with GCC. +Starting from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays of fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc, most of Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords, making them callable from both host and device code. +This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen header. +This might be useful to disable some warnings when a .cu file makes use of Eigen on the host side only. +However, in both cases, host's SIMD vectorization has to be disabled in .cu files. +It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files. Known issues: - \c nvcc with MS Visual Studio does not work (patch welcome) - - \c nvcc with \c clang does not work (patch welcome) - - \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \ header file. To workaround this, you can add the following before including any other files: \code // workaround issue between gcc >= 4.7 and cuda 5.5 From cda47c42c23035130488fd9a4437b2c1910d0bab Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 17 Jul 2017 21:08:20 +0200 Subject: [PATCH 006/680] Fix compilation in c++98 mode.
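To complement the updated UsingNVCC page above, this is what a kernel working on fixed-size Eigen types can look like; an illustrative sketch only (the kernel name and signature are made up, not part of the patch):
\code
// inside a .cu file compiled by nvcc
#include <Eigen/Core>

__global__ void dots(const Eigen::Vector3f* x, const Eigen::Vector3f* y, float* out, int n)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    out[i] = x[i].dot(y[i]);  // Eigen methods carry __device__ __host__ by default
}
\endcode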
--- Eigen/src/Core/MathFunctions.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index a906b7629..5ba5293a0 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1378,13 +1378,14 @@ T acos(const T &x) { return acos(x); } - +#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acosh(const T &x) { EIGEN_USING_STD_MATH(acosh); return acosh(x); } +#endif #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); } @@ -1408,12 +1409,14 @@ T asin(const T &x) { return asin(x); } +#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asinh(const T &x) { EIGEN_USING_STD_MATH(asinh); return asinh(x); } +#endif #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); } @@ -1437,12 +1440,14 @@ T atan(const T &x) { return atan(x); } +#if EIGEN_HAS_CXX11_MATH template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atanh(const T &x) { EIGEN_USING_STD_MATH(atanh); return atanh(x); } +#endif #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); } From 55d7181557aebe43fb009ff8669f32741e15cfba Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Jul 2017 09:47:28 +0200 Subject: [PATCH 007/680] Fix laziness of operator* with CUDA --- Eigen/src/Core/GeneralProduct.h | 21 +++++++++++---------- Eigen/src/Core/MatrixBase.h | 9 --------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 95dadfea8..dec24848d 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -24,12 +24,17 @@ template struct product_type_selector; template struct product_size_category { - enum { is_large = MaxSize == Dynamic || - Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || - (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), - value = is_large ? Large - : Size == 1 ? 1 - : Small + enum { + #ifndef EIGEN_CUDA_ARCH + is_large = MaxSize == Dynamic || + Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || + (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), + #else + is_large = 0, + #endif + value = is_large ? Large + : Size == 1 ? 1 + : Small }; }; @@ -379,8 +384,6 @@ template<> struct gemv_dense_selector * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ -#ifndef EIGEN_CUDACC - template template inline const Product @@ -412,8 +415,6 @@ MatrixBase::operator*(const MatrixBase &other) const return Product(derived(), other.derived()); } -#endif // EIGEN_CUDACC - /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
* * The returned product will behave like any other expressions: the coefficients of the product will be diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 908d35dfe..5a2146455 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -160,20 +160,11 @@ template class MatrixBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); -#ifdef EIGEN_CUDACC template EIGEN_DEVICE_FUNC - const Product - operator*(const MatrixBase &other) const - { return this->lazyProduct(other); } -#else - - template const Product operator*(const MatrixBase &other) const; -#endif - template EIGEN_DEVICE_FUNC const Product From d580a90c9ab5ed5521a79670f73bcea5ee755fe0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Jul 2017 10:03:54 +0200 Subject: [PATCH 008/680] Disable BDCSVD preallocation check. --- test/bdcsvd.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index f9f687aac..6c7b09696 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -104,7 +104,8 @@ void test_bdcsvd() CALL_SUBTEST_7( BDCSVD(10,10) ); // Check that preallocation avoids subsequent mallocs - CALL_SUBTEST_9( svd_preallocate() ); + // Disabled because not supported by BDCSVD + // CALL_SUBTEST_9( svd_preallocate() ); CALL_SUBTEST_2( svd_underoverflow() ); } From 1f4b24d2df6200c1074c2fca45d3905a33c54a3b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Jul 2017 10:13:48 +0200 Subject: [PATCH 009/680] Do not preallocate more space than the matrix size (when the sparse matrix boils down to a vector) --- Eigen/src/SparseCore/SparseAssign.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h index 18352a847..113463258 100644 --- a/Eigen/src/SparseCore/SparseAssign.h +++ b/Eigen/src/SparseCore/SparseAssign.h @@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src) // eval without temporary dst.resize(src.rows(), src.cols()); dst.setZero(); - dst.reserve((std::max)(src.rows(),src.cols())*2); + dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2)); for (Index j=0; j Date: Thu, 17 Aug 2017 11:51:22 +0200 Subject: [PATCH 010/680] Make NoAlias and JacobiRotation compatible with CUDA. --- Eigen/src/Core/MatrixBase.h | 4 +++- Eigen/src/Core/NoAlias.h | 1 + Eigen/src/Jacobi/Jacobi.h | 19 +++++++++++++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 5a2146455..5786f76e0 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -296,7 +296,7 @@ template class MatrixBase EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } - NoAlias noalias(); + NoAlias EIGEN_DEVICE_FUNC noalias(); // TODO forceAlignedAccess is temporarily disabled // Need to find a nicer workaround.
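With is_large forced to 0 under EIGEN_CUDA_ARCH (see the product_size_category hunk above) and noalias() now usable in device code, fixed-size products work inside kernels; an illustrative sketch (not from the patch):
\code
__global__ void prod3x3(const Eigen::Matrix3f* A, const Eigen::Matrix3f* B, Eigen::Matrix3f* C)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  C[i].noalias() = A[i] * B[i];  // coefficient-based small product: no heap, no gemm path
}
\endcode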
@@ -428,8 +428,10 @@ template class MatrixBase ///////// Jacobi module ///////// template + EIGEN_DEVICE_FUNC void applyOnTheLeft(Index p, Index q, const JacobiRotation& j); template + EIGEN_DEVICE_FUNC void applyOnTheRight(Index p, Index q, const JacobiRotation& j); ///////// SparseCore module ///////// diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 33908010b..41fae5096 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -33,6 +33,7 @@ class NoAlias public: typedef typename ExpressionType::Scalar Scalar; + EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} template diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index c30326e1d..75595517a 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -37,17 +37,20 @@ template class JacobiRotation typedef typename NumTraits::Real RealScalar; /** Default constructor without any initialization. */ + EIGEN_DEVICE_FUNC JacobiRotation() {} /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */ + EIGEN_DEVICE_FUNC JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {} - Scalar& c() { return m_c; } - Scalar c() const { return m_c; } - Scalar& s() { return m_s; } - Scalar s() const { return m_s; } + EIGEN_DEVICE_FUNC Scalar& c() { return m_c; } + EIGEN_DEVICE_FUNC Scalar c() const { return m_c; } + EIGEN_DEVICE_FUNC Scalar& s() { return m_s; } + EIGEN_DEVICE_FUNC Scalar s() const { return m_s; } /** Concatenates two planar rotation */ + EIGEN_DEVICE_FUNC JacobiRotation operator*(const JacobiRotation& other) { using numext::conj; @@ -56,19 +59,26 @@ template class JacobiRotation } /** Returns the transposed transformation */ + EIGEN_DEVICE_FUNC JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); } /** Returns the adjoint transformation */ + EIGEN_DEVICE_FUNC JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); } template + EIGEN_DEVICE_FUNC bool makeJacobi(const MatrixBase&, Index p, Index q); + EIGEN_DEVICE_FUNC bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0); protected: + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type); + EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type); Scalar m_c, m_s; @@ -264,6 +274,7 @@ namespace internal { * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j); } From 89c01a494aff2bd03b48a9858eed95a4a7ce9556 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 11:55:00 +0200 Subject: [PATCH 011/680] Add unit test for has_ReturnType --- test/meta.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/meta.cpp b/test/meta.cpp index b8dea68e8..bd505762e 100644 --- a/test/meta.cpp +++ b/test/meta.cpp @@ -15,6 +15,10 @@ bool check_is_convertible(const From&, const To&) return internal::is_convertible::value; } +struct FooReturnType { + typedef int ReturnType; +}; + void test_meta() { VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value)); @@ -75,6 +79,11 @@ void test_meta() VERIFY((!check_is_convertible(A*B, f) )); VERIFY(( check_is_convertible(A*B, A) )); } + + VERIFY(( 
internal::has_ReturnType::value )); + VERIFY(( internal::has_ReturnType >::value )); + VERIFY(( !internal::has_ReturnType::value )); + VERIFY(( !internal::has_ReturnType::value )); VERIFY(internal::meta_sqrt<1>::ret == 1); #define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt::ret == int(std::sqrt(double(X)))) From b95f92843c58a914c46ab091009146288b8b775c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 12:07:10 +0200 Subject: [PATCH 012/680] Fix support for MKL's BLAS when using MKL_DIRECT_CALL. --- .../GeneralMatrixMatrixTriangular_BLAS.h | 8 +++- .../Core/products/GeneralMatrixMatrix_BLAS.h | 19 +++++--- .../Core/products/GeneralMatrixVector_BLAS.h | 19 +++++--- .../products/SelfadjointMatrixMatrix_BLAS.h | 48 ++++++++++++------- .../products/SelfadjointMatrixVector_BLAS.h | 9 +++- .../products/TriangularMatrixMatrix_BLAS.h | 39 ++++++++++----- .../products/TriangularMatrixVector_BLAS.h | 46 +++++++++++------- .../products/TriangularSolverMatrix_BLAS.h | 40 ++++++++++------ Eigen/src/Core/util/MKL_support.h | 8 ++-- 9 files changed, 157 insertions(+), 79 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 41e18ff07..9176a1382 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \ EIGTYPE beta(1); \ - BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \ } \ }; @@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, double, z) -GEMM_SPECIALIZATION(scomplex, cf, float, c) +#ifdef EIGEN_USE_MKL +GEMM_SPECIALIZATION(double, d, double, dgemm) +GEMM_SPECIALIZATION(float, f, float, sgemm) +GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm) +GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm) +#else +GEMM_SPECIALIZATION(double, d, double, dgemm_) +GEMM_SPECIALIZATION(float, f, float, sgemm_) +GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_) +GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h index e3a5d5892..6e36c2b3c 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float) EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) -#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \ +#define 
EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ template \ struct general_matrix_vector_product_gemv \ { \ @@ -113,14 +113,21 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) -EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) -EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) -EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv) +#else +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index a45238d69..9a5318507 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -40,7 +40,7 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -81,13 +81,13 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -144,20 +144,26 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_BLAS_SYMM_L(double, double, d, d) -EIGEN_BLAS_SYMM_L(float, float, f, s) -EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_L(double, double, d, dsymm) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm) +EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_L(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) +#endif /* Optimized matrix * 
selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -197,13 +203,13 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _lhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -259,15 +265,21 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_BLAS_SYMM_R(double, double, d, d) -EIGEN_BLAS_SYMM_R(float, float, f, s) -EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_R(double, double, d, dsymm) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm) +EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_R(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_) +#endif } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 38f23accf..1238345e3 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +#else EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) +#endif } // end namespace internal diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index aecded6bb..a25197ab0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, 
true) EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm \ @@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_CM(double, double, d, d) -EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_CM(float, float, f, s) -EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_CM(double, double, d, d,) +EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_CM(float, float, f, s,) +EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_CM(double, double, d, d, _) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _) +EIGEN_BLAS_TRMV_CM(float, float, f, s, _) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _) +#endif // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_RM(double, double, d, d) -EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_RM(float, float, f, s) -EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_RM(double, double, d, d,) +EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_RM(float, float, f, s,) +EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_RM(double, double, d, d,_) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_) +EIGEN_BLAS_TRMV_RM(float, float, f, s,_) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h index 88c0fb794..f0775116a 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h @@ -38,7 +38,7 @@ namespace Eigen { namespace internal { // implements LeftSide op(triangular)^-1 * general 
-#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \ +#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \ template \ struct triangular_solve_matrix \ { \ @@ -80,18 +80,24 @@ struct triangular_solve_matrix \ struct triangular_solve_matrix \ { \ @@ -133,16 +139,22 @@ struct triangular_solve_matrix /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ @@ -108,6 +109,10 @@ #endif #endif +#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) +#include "../../misc/blas.h" +#endif + namespace Eigen { typedef std::complex dcomplex; @@ -121,8 +126,5 @@ typedef int BlasIndex; } // end namespace Eigen -#if defined(EIGEN_USE_BLAS) -#include "../../misc/blas.h" -#endif #endif // EIGEN_MKL_SUPPORT_H From 8c858bd8919936f250d2e7b090c0d17f00dbb85b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 12:17:45 +0200 Subject: [PATCH 013/680] Clarify doc regarding the usage of MKL_DIRECT_CALL --- Eigen/src/Core/util/MKL_support.h | 2 +- doc/UsingIntelMKL.dox | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index e656799bf..b7d6ecc76 100755 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -49,7 +49,7 @@ #define EIGEN_USE_LAPACKE #endif -#if defined(EIGEN_USE_MKL_VML) +#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL) #define EIGEN_USE_MKL #endif diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox index a1a3a18f2..53e5de42c 100644 --- a/doc/UsingIntelMKL.dox +++ b/doc/UsingIntelMKL.dox @@ -63,6 +63,8 @@ In addition you can choose which parts will be substituted by defining one or mu \c EIGEN_USE_MKL_ALL Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML +In order to be able to use \b MKL_DIRECT_CALL for BLAS level 2 and 3 routines, then the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined. This is needed to tell Eigen that the BLAS backend is the MKL and that the MKL interface must be used instead of the generic F77 one. + Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details. Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module. 
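(Editorial note: a minimal sketch of how user code opts in to the backends documented above. EIGEN_USE_MKL_ALL and MKL_DIRECT_CALL are the documented macros; the program itself, including the matrix sizes, is arbitrary illustration.)

    // Define before including any Eigen header. As explained above,
    // MKL_DIRECT_CALL additionally requires EIGEN_USE_MKL when no other
    // EIGEN_USE_MKL_* macro has been defined.
    #define EIGEN_USE_MKL_ALL
    #include <Eigen/Dense>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(64, 64);
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(64, 64);
      Eigen::MatrixXd C = A * B;                           // routed to MKL dgemm
      Eigen::LLT<Eigen::MatrixXd> llt(C.transpose() * C);  // LAPACKE dpotrf path
      return llt.info() == Eigen::Success ? 0 : 1;
    }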
From f727844658f8c9c01302b5cb08d81c62c572b82b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 21:58:39 +0200 Subject: [PATCH 014/680] use MKL's lapacke.h header when using MKL --- Eigen/Cholesky | 4 ++++ Eigen/Eigenvalues | 4 ++++ Eigen/LU | 4 ++++ Eigen/QR | 4 ++++ Eigen/SVD | 4 ++++ 5 files changed, 20 insertions(+) diff --git a/Eigen/Cholesky b/Eigen/Cholesky index 369d1f5ec..bb899a2ce 100644 --- a/Eigen/Cholesky +++ b/Eigen/Cholesky @@ -31,7 +31,11 @@ #include "src/Cholesky/LLT.h" #include "src/Cholesky/LDLT.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Cholesky/LLT_LAPACKE.h" #endif diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues index 009e529e1..f3f661b07 100644 --- a/Eigen/Eigenvalues +++ b/Eigen/Eigenvalues @@ -45,7 +45,11 @@ #include "src/Eigenvalues/GeneralizedEigenSolver.h" #include "src/Eigenvalues/MatrixBaseEigenvalues.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Eigenvalues/RealSchur_LAPACKE.h" #include "src/Eigenvalues/ComplexSchur_LAPACKE.h" #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h" diff --git a/Eigen/LU b/Eigen/LU index 6f6c55629..6418a86e1 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -28,7 +28,11 @@ #include "src/LU/FullPivLU.h" #include "src/LU/PartialPivLU.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/LU/PartialPivLU_LAPACKE.h" #endif #include "src/LU/Determinant.h" diff --git a/Eigen/QR b/Eigen/QR index 80838e3bd..c7e914469 100644 --- a/Eigen/QR +++ b/Eigen/QR @@ -36,7 +36,11 @@ #include "src/QR/ColPivHouseholderQR.h" #include "src/QR/CompleteOrthogonalDecomposition.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/QR/HouseholderQR_LAPACKE.h" #include "src/QR/ColPivHouseholderQR_LAPACKE.h" #endif diff --git a/Eigen/SVD b/Eigen/SVD index 86143c23d..5d0e75f7f 100644 --- a/Eigen/SVD +++ b/Eigen/SVD @@ -37,7 +37,11 @@ #include "src/SVD/JacobiSVD.h" #include "src/SVD/BDCSVD.h" #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT) +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/SVD/JacobiSVD_LAPACKE.h" #endif From 2810ba194be85af0012f786e6c032b2bfe432be9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 17 Aug 2017 22:12:26 +0200 Subject: [PATCH 015/680] Clarify MKL_DIRECT_CALL doc. --- doc/UsingIntelMKL.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox index 53e5de42c..6de14afad 100644 --- a/doc/UsingIntelMKL.dox +++ b/doc/UsingIntelMKL.dox @@ -63,7 +63,7 @@ In addition you can choose which parts will be substituted by defining one or mu \c EIGEN_USE_MKL_ALL Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML -In order to be able to use \b MKL_DIRECT_CALL for BLAS level 2 and 3 routines, then the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined. This is needed to tell Eigen that the BLAS backend is the MKL and that the MKL interface must be used instead of the generic F77 one. +The options can be combined with \b MKL_DIRECT_CALL to enable MKL direct call feature. 
This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices. To make it work properly, the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined. Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details. From e6021cc8cc6298196026119e8486c67ea2604376 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 11:32:55 +0200 Subject: [PATCH 016/680] bug #1458: fix documentation of LLT and LDLT info() method. --- Eigen/src/Cholesky/LDLT.h | 2 +- Eigen/src/Cholesky/LLT.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 9b4fdb414..968427b3a 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -248,7 +248,7 @@ template class LDLT /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const { diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index e6c02d803..3b0b9faf4 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -177,7 +177,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \c NumericalIssue if the matrix.appears not to be positive definite. */ ComputationInfo info() const { From a6e7a41a553d3663cefc45a5d2b509494d8adb37 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 11:37:32 +0200 Subject: [PATCH 017/680] bug #1455: Cholesky module depends on Jacobi for rank-updates. --- Eigen/Cholesky | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/Cholesky b/Eigen/Cholesky index bb899a2ce..1332b540d 100644 --- a/Eigen/Cholesky +++ b/Eigen/Cholesky @@ -9,6 +9,7 @@ #define EIGEN_CHOLESKY_MODULE_H #include "Core" +#include "Jacobi" #include "src/Core/util/DisableStupidWarnings.h" From 2c3d70d915a939d0da33ca22742a26c271adcb82 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 12:04:09 +0200 Subject: [PATCH 018/680] Re-enable hidden doc in LLT --- Eigen/src/Cholesky/LLT.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 3b0b9faf4..3de94f726 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -24,7 +24,7 @@ template struct LLT_Traits; * * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. - * The other triangular part won't be read. + * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite * matrix A such that A = LL^* = U^*U, where L is lower triangular. @@ -43,12 +43,11 @@ template struct LLT_Traits; * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * + * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. 
+ * Therefore, the strict lower part does not have to store correct values. + * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ - /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) - * Note that during the decomposition, only the upper triangular part of A is considered. Therefore, - * the strict lower part does not have to store correct values. - */ template class LLT { public: From 21d0a0bcf5eef2fb89f1ca48b65d52ec03e97272 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 12:46:35 +0200 Subject: [PATCH 019/680] bug #1456: add perf recommendation for LLT and storage format --- Eigen/src/Cholesky/LLT.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 3de94f726..52d101aff 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -41,6 +41,11 @@ template struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * + * \b Performance: for best performance, it is recommended to use a column-major storage format + * with the Lower triangular part (the default), or, equivalently, a row-major storage format, + * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization + * step, and rank-updates can be up to 3 times slower. + * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. From e27f17bf5c921dca73b4d2dc1a90863b36292fdc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 13:27:37 +0200 Subject: [PATCH 020/680] Gub 1453: fix Map with non-default inner-stride but no outer-stride. --- Eigen/src/Cholesky/LLT.h | 2 +- Eigen/src/Core/Map.h | 15 ++++++++--- test/mapstride.cpp | 57 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 52d101aff..7f29dad9c 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -42,7 +42,7 @@ template struct LLT_Traits; * Output: \verbinclude LLT_example.out * * \b Performance: for best performance, it is recommended to use a column-major storage format - * with the Lower triangular part (the default), or, equivalently, a row-major storage format, + * with the Lower triangular part (the default), or, equivalently, a row-major storage format * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization * step, and rank-updates can be up to 3 times slower. * diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 06d196702..7ca6a9280 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -20,11 +20,17 @@ struct traits > { typedef traits TraitsBase; enum { + PlainObjectTypeInnerSize = ((traits::Flags&RowMajorBit)==RowMajorBit) + ? PlainObjectType::ColsAtCompileTime + : PlainObjectType::RowsAtCompileTime, + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? int(PlainObjectType::InnerStrideAtCompileTime) : int(StrideType::InnerStrideAtCompileTime), OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) + ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic + ? 
Dynamic + : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize)) : int(StrideType::OuterStrideAtCompileTime), Alignment = int(MapOptions)&int(AlignedMask), Flags0 = TraitsBase::Flags & (~NestByRefBit), @@ -108,9 +114,10 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : IsVectorAtCompileTime ? this->size() - : int(Flags)&RowMajorBit ? this->cols() - : this->rows(); + : internal::traits::OuterStrideAtCompileTime != Dynamic ? internal::traits::OuterStrideAtCompileTime + : IsVectorAtCompileTime ? (this->size() * innerStride()) + : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) + : (this->rows() * innerStride()); } /** Constructor in the fixed-size case. diff --git a/test/mapstride.cpp b/test/mapstride.cpp index 4858f8fea..de77dc5de 100644 --- a/test/mapstride.cpp +++ b/test/mapstride.cpp @@ -58,7 +58,7 @@ template void map_class_matrix(const MatrixTy MatrixType m = MatrixType::Random(rows,cols); Scalar s1 = internal::random(); - Index arraysize = 2*(rows+4)*(cols+4); + Index arraysize = 4*(rows+4)*(cols+4); Scalar* a_array1 = internal::aligned_new(arraysize+1); Scalar* array1 = a_array1; @@ -143,9 +143,62 @@ template void map_class_matrix(const MatrixTy VERIFY_IS_APPROX(map,s1*m); } + // test inner stride and no outer stride + for(int k=0; k<2; ++k) + { + if(k==1 && (m.innerSize()*2)*m.outerSize() > maxsize2) + break; + Scalar* array = (k==0 ? array1 : array2); + + Map > map(array, rows, cols, InnerStride(2)); + map = m; + VERIFY(map.outerStride() == map.innerSize()*2); + for(int i = 0; i < m.outerSize(); ++i) + for(int j = 0; j < m.innerSize(); ++j) + { + VERIFY(array[map.innerSize()*i*2+j*2] == m.coeffByOuterInner(i,j)); + VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + } + VERIFY_IS_APPROX(s1*map,s1*m); + map *= s1; + VERIFY_IS_APPROX(map,s1*m); + } + internal::aligned_delete(a_array1, arraysize+1); } +// Additional tests for inner-stride but no outer-stride +template +void bug1453() +{ + const int data[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + typedef Matrix RowMatrixXi; + typedef Matrix ColMatrix23i; + typedef Matrix ColMatrix32i; + typedef Matrix RowMatrix23i; + typedef Matrix RowMatrix32i; + + VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>())); + VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>())); + VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>())); + VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>())); + + VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>())); + VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>())); + VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); + VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); + + VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>())); + VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>())); + VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>())); + 
VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>())); + + VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>())); + VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>())); + VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); + VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); +} + void test_mapstride() { for(int i = 0; i < g_repeat; i++) { @@ -175,6 +228,8 @@ void test_mapstride() CALL_SUBTEST_5( map_class_matrix(MatrixXi(internal::random(1,maxn),internal::random(1,maxn))) ); CALL_SUBTEST_6( map_class_matrix(MatrixXcd(internal::random(1,maxn),internal::random(1,maxn))) ); CALL_SUBTEST_6( map_class_matrix(MatrixXcd(internal::random(1,maxn),internal::random(1,maxn))) ); + + CALL_SUBTEST_5( bug1453<0>() ); TEST_SET_BUT_UNUSED_VARIABLE(maxn); } From be281e528967ed00ed52f50a476beef10ff0dec3 Mon Sep 17 00:00:00 2001 From: Jim Radford Date: Wed, 4 Jan 2017 14:43:56 -0800 Subject: [PATCH 021/680] LLT: avoid making a copy when decomposing in place --- Eigen/src/Cholesky/LLT.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 7f29dad9c..c7683232d 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -428,7 +428,8 @@ LLT& LLT::compute(const EigenBase eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); - m_matrix = a.derived(); + if (!internal::is_same_dense(m_matrix, a.derived())) + m_matrix = a.derived(); // Compute matrix L1 norm = max abs column sum. m_l1_norm = RealScalar(0); From 0c226644d8cf21d35cfcf46c60ce66d2183f530e Mon Sep 17 00:00:00 2001 From: Jim Radford Date: Wed, 4 Jan 2017 14:42:57 -0800 Subject: [PATCH 022/680] LLT: const the arg to solveInPlace() to allow passing .transpose(), .block(), etc. --- Eigen/src/Cholesky/LLT.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index c7683232d..152837bfd 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -150,7 +150,7 @@ template class LLT } template - void solveInPlace(MatrixBase &bAndX) const; + void solveInPlace(const MatrixBase &bAndX) const; template LLT& compute(const EigenBase& matrix); @@ -493,7 +493,7 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const */ template template -void LLT::solveInPlace(MatrixBase &bAndX) const +void LLT::solveInPlace(const MatrixBase &bAndX) const { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_matrix.rows()==bAndX.rows()); From b1e312edd607bcfa99192d53f55b2ac974644c44 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 15 Feb 2017 10:13:01 +0000 Subject: [PATCH 023/680] Adding TensorPatch.h for sycl backend. 
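For context on the operation being ported: extract_patches() extracts every window of the requested size from a tensor and stacks the windows along one extra dimension (the last one for ColMajor, the first one for RowMajor). A minimal host-side sketch mirroring what the new SYCL test checks; the sizes are arbitrary:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 4> t(2, 3, 5, 7);
      t.setRandom();
      Eigen::array<ptrdiff_t, 4> patch_dims{{1, 2, 2, 1}};
      // ColMajor result has rank 5; dimension 4 indexes the patches,
      // here (2-1+1)*(3-2+1)*(5-2+1)*(7-1+1) = 2*2*4*7 of them.
      Eigen::Tensor<float, 5> patches = t.extract_patches(patch_dims);
      return patches.dimension(4) == 2 * 2 * 4 * 7 ? 0 : 1;
    }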
--- .../Eigen/CXX11/src/Tensor/TensorPatch.h | 11 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_patch_sycl.cpp | 252 ++++++++++++++++++ 3 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_patch_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 886a254f6..cead2eac8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -99,11 +99,11 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), patch_dims(op.patch_dims()) { Index num_patches = 1; const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - const PatchDim& patch_dims = op.patch_dims(); + // const PatchDim& patch_dims = op.patch_dims(); if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims-1; ++i) { m_dimensions[i] = patch_dims[i]; @@ -255,6 +255,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return patch_dims; } + protected: Dimensions m_dimensions; array m_outputStrides; @@ -262,6 +267,8 @@ struct TensorEvaluator, Device> array m_patchStrides; TensorEvaluator m_impl; + // required by sycl + const PatchDim& patch_dims; }; } // end namespace Eigen diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 003c9de0b..d01233fb2 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -167,6 +167,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp new file mode 100644 index 000000000..b75219a5b --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp @@ -0,0 +1,252 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
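// Editorial note (not part of the original patch): main.h calls
// test_<EIGEN_TEST_FUNC>(), so the macro below must match the
// test_cxx11_tensor_patch_sycl() function defined at the bottom of this file,
// and EIGEN_USE_SYCL routes .device(sycl_device) expressions through the SYCL
// evaluators added by this patch.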
+ + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_patch_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include + +using Eigen::Tensor; + +template +static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array patchTensorRange; + if (DataLayout == ColMajor) { + patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}}; + }else{ + patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}}; + } + + Tensor tensor(tensorRange); + Tensor no_patch(patchTensorRange); + + tensor.setRandom(); + + array patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_no_patch = static_cast(sycl_device.allocate(patchTensorBuffSize)); + + TensorMap> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap> gpu_no_patch(gpu_data_no_patch, patchTensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + } else { + VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size()); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), 1); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + if (DataLayout == ColMajor) { + patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}}; + }else{ + patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}}; + } + Tensor single_patch(patchTensorRange); + patchTensorBuffSize =single_patch.size()*sizeof(DataType); + DataType* gpu_data_single_patch = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch(gpu_data_single_patch, patchTensorRange); + + gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + } else { + VERIFY_IS_EQUAL(single_patch.dimension(0), 1); + VERIFY_IS_EQUAL(single_patch.dimension(1), 2); + VERIFY_IS_EQUAL(single_patch.dimension(2), 3); + VERIFY_IS_EQUAL(single_patch.dimension(3), 5); + VERIFY_IS_EQUAL(single_patch.dimension(4), 7); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], 
single_patch.data()[i]); + } + + + + + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,2,1,2*2*4*7}}; + }else{ + patchTensorRange = {{2*2*4*7, 1, 2,2,1}}; + } + Tensor twod_patch(patchTensorRange); + patchTensorBuffSize =twod_patch.size()*sizeof(DataType); + DataType* gpu_data_twod_patch = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange); + + gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + } else { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 1); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + } else { + patch_loc = l + 7 * (k + 4 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0)); + } + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,3,5,2*2*3*3}}; + }else{ + patchTensorRange = {{2*2*3*3, 1, 2,3,5}}; + } + Tensor threed_patch(patchTensorRange); + patchTensorBuffSize =threed_patch.size()*sizeof(DataType); + DataType* gpu_data_threed_patch = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange); + + gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + } else { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 5); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + } else { + patch_loc = l + 3 * (k + 3 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), 
threed_patch(patch_loc,0,x,y,z)); + } + } + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_no_patch); + sycl_device.deallocate(gpu_data_single_patch); + sycl_device.deallocate(gpu_data_twod_patch); + sycl_device.deallocate(gpu_data_threed_patch); +} + +template void sycl_tensor_patch_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_patch_sycl(sycl_device); + test_simple_patch_sycl(sycl_device); +} +void test_cxx11_tensor_patch_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_patch_test_per_device(device)); + } +} From 91982b91c02deb5e1ce557bbc5c96fee19c636ed Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 15 Feb 2017 16:28:12 +0000 Subject: [PATCH 024/680] Adding TensorLayoutSwapOp for sycl. --- .../TensorSyclConvertToDeviceExpression.h | 31 +++-- .../src/Tensor/TensorSyclExprConstructor.h | 21 ++- .../src/Tensor/TensorSyclExtractAccessor.h | 15 +++ .../src/Tensor/TensorSyclExtractFunctors.h | 27 ++-- .../CXX11/src/Tensor/TensorSyclLeafCount.h | 14 +- .../src/Tensor/TensorSyclPlaceHolderExpr.h | 17 ++- unsupported/test/CMakeLists.txt | 1 + .../test/cxx11_tensor_layout_swap_sycl.cpp | 126 ++++++++++++++++++ unsupported/test/cxx11_tensor_patch_sycl.cpp | 9 +- 9 files changed, 217 insertions(+), 44 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_layout_swap_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index ee8f3c9c2..ff5097141 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -91,28 +91,35 @@ ASSIGNCONVERT(, false) #undef ASSIGNCONVERT /// specialisation of the \ref ConvertToDeviceExpression struct when the node -/// type is either TensorForcedEvalOp or TensorEvalToOp +/// type is TensorEvalToOp #define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ template \ struct ConvertToDeviceExpression > \ : DeviceConvertor{}; -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp -#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression::Type> Type;\ -}; -KERNELBROKERCONVERTFORCEDEVAL(const) -KERNELBROKERCONVERTFORCEDEVAL() -#undef KERNELBROKERCONVERTFORCEDEVAL - - KERNELBROKERCONVERT(const, true, TensorEvalToOp) KERNELBROKERCONVERT(, false, TensorEvalToOp) #undef KERNELBROKERCONVERT +/// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp +#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(CVQual, ExprNode)\ +template \ +struct ConvertToDeviceExpression > {\ + typedef CVQual ExprNode< typename ConvertToDeviceExpression::Type> Type;\ +}; + +// TensorForcedEvalOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp) + +// TensorLayoutSwapOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorLayoutSwapOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorLayoutSwapOp) +#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP + + + /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp #define 
KERNELBROKERCONVERTREDUCTION(CVQual)\ template class MakePointer_>\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 3b83b1d2c..6b6093fa3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -223,7 +223,7 @@ struct ExprConstructor, CVQua Type expr;\ template \ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ + : nestedExpression(funcD.xprExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\ }; EVALTO(const) @@ -386,6 +386,25 @@ SYCLTENSORCHIPPINGOPEXPR() #undef SYCLTENSORCHIPPINGOPEXPR + +// TensorLayoutSwapOp +#define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\ +template\ +struct ExprConstructor , CVQual TensorLayoutSwapOp, Params... >{\ + typedef ExprConstructor my_xpr_type;\ + typedef CVQual TensorLayoutSwapOp Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\ +}; + +SYCLTENSORLAYOUTSWAPOPEXPR(const) +SYCLTENSORLAYOUTSWAPOPEXPR() +#undef SYCLTENSORLAYOUTSWAPOPEXPR + + /// template deduction for \ref ExprConstructor struct template auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple &t) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index b512d43f6..213dd25ea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -226,6 +226,21 @@ SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorLayoutSwapOp. +#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ +template\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORLAYOUTSWAPOPEXTACC(const) +SYCLTENSORLAYOUTSWAPOPEXTACC() +#undef SYCLTENSORLAYOUTSWAPOPEXTACC + + + /// template deduction for \ref ExtractAccessor template auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index ee020184b..1506e8189 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -39,7 +39,6 @@ template struct FunctorExtractor{ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } FunctorExtractor(const Evaluator& expr) : m_dimensions(expr.dimensions()) {} - }; /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything @@ -143,19 +142,23 @@ SYCLEXTRFUNCASSIGNOP(const) SYCLEXTRFUNCASSIGNOP() #undef SYCLEXTRFUNCASSIGNOP -/// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorEvalToOp, This is an specialisation without OP so it has to be separated. 
-#define SYCLEXTRFUNCEVALTOOP(CVQual)\ -template \ -struct FunctorExtractor, Dev> > {\ - FunctorExtractor > rhsExpr;\ - FunctorExtractor(const TensorEvaluator, Dev>& expr)\ - : rhsExpr(expr.impl()) {}\ +/// specialisation of the \ref FunctorExtractor struct when the node types are +/// TensorEvalToOp, TensorLayoutSwapOp. This is an specialisation without OP so it has to be separated. +#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(CVQual, ExprNode)\ +template \ +struct FunctorExtractor, Dev> > {\ + FunctorExtractor > xprExpr;\ + FunctorExtractor(const TensorEvaluator, Dev>& expr)\ + : xprExpr(expr.impl()) {}\ }; +//TensorEvalToOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorEvalToOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorEvalToOp) +// TensorLayoutSwapOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorLayoutSwapOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorLayoutSwapOp) -SYCLEXTRFUNCEVALTOOP(const) -SYCLEXTRFUNCEVALTOOP() -#undef SYCLEXTRFUNCEVALTOOP +#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUT template struct DimConstr { template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index a1c112f4d..15729310d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -94,15 +94,17 @@ SYCLFORCEDEVALLEAFCOUNT() #undef SYCLFORCEDEVALLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp -#define EVALTOLEAFCOUNT(CVQual)\ +#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\ template \ -struct LeafCount > {\ - static const size_t Count = 1 + CategoryCount::Count;\ +struct LeafCount > {\ + static const size_t Count = Num + CategoryCount::Count;\ }; -EVALTOLEAFCOUNT(const) -EVALTOLEAFCOUNT() -#undef EVALTOLEAFCOUNT +EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPLEAFCOUNT(, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorLayoutSwapOp, 0) +EVALTOLAYOUTSWAPLEAFCOUNT(, TensorLayoutSwapOp, 0) +#undef EVALTOLAYOUTSWAPLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp #define REDUCTIONLEAFCOUNT(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 74566dcee..ba0d17e0c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -144,16 +144,19 @@ FORCEDEVAL() #undef FORCEDEVAL /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorEvalToOp -#define EVALTO(CVQual)\ +/// TensorEvalToOp, TensorLayoutSwapOp +#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\ template \ -struct PlaceHolderExpression, N> {\ - typedef CVQual TensorEvalToOp::ArgType> Type;\ +struct PlaceHolderExpression, N> {\ + typedef CVQual ExprNode::ArgType> Type;\ }; -EVALTO(const) -EVALTO() -#undef EVALTO +EVALTOLAYOUTSWAP(const, TensorEvalToOp) +EVALTOLAYOUTSWAP(, TensorEvalToOp) +EVALTOLAYOUTSWAP(const, TensorLayoutSwapOp) +EVALTOLAYOUTSWAP(, TensorLayoutSwapOp) + +#undef EVALTOLAYOUTSWAP /// specialisation of the \ref PlaceHolderExpression when the node is diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d01233fb2..57580f805 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -168,6 +168,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") 
ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp new file mode 100644 index 000000000..9e8db8b4b --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp @@ -0,0 +1,126 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_layout_swap_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include + +using Eigen::Tensor; + +template +static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + + Tensor tensor1(tensorColRange); + Tensor tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap> gpu1(gpu_data1, tensorColRange); + TensorMap> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor tensor(2,3,7); + //tensor.setRandom(); + +// Tensor tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template +static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + Tensor tensor1(tensorColRange); + Tensor tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap> gpu1(gpu_data1, tensorColRange); + TensorMap> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.swap_layout().device(sycl_device)=gpu1; + 
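  // Editorial note (not part of the original patch): swap_layout() on the
  // left-hand side reinterprets gpu2's RowMajor storage as a ColMajor view
  // with reversed dimensions, so gpu1 can be assigned directly without an
  // intermediate copy; the loop below checks the reversed index order.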
sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor tensor(2,3,7); +// tensor.setRandom(); + + //Tensor tensor2(7,3,2); +// tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + + +template void sycl_tensor_layout_swap_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_swap_sycl(sycl_device); + test_swap_as_lvalue_sycl(sycl_device); +} +void test_cxx11_tensor_layout_swap_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp index b75219a5b..88a29cb31 100644 --- a/unsupported/test/cxx11_tensor_patch_sycl.cpp +++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp @@ -12,7 +12,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_patch_sycl @@ -80,10 +79,12 @@ static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ for (int i = 0; i < tensor.size(); ++i) { VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); } + patch_dims[0] = 2; patch_dims[1] = 3; patch_dims[2] = 5; patch_dims[3] = 7; + if (DataLayout == ColMajor) { patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}}; }else{ @@ -114,15 +115,11 @@ static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ for (int i = 0; i < tensor.size(); ++i) { VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); } - - - - - patch_dims[0] = 1; patch_dims[1] = 2; patch_dims[2] = 2; patch_dims[3] = 1; + if (DataLayout == ColMajor) { patchTensorRange = {{1,2,2,1,2*2*4*7}}; }else{ From 79ebc8f76137f151c78b4f61cd99fae62bf6c34f Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 20 Feb 2017 12:11:05 +0000 Subject: [PATCH 025/680] Adding Sycl backend for TensorImagePatchOP.h; adding Sycl backend for TensorInflation.h. 
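For context on the two operations this patch ports: extract_image_patches() gathers sliding 2-D windows from an image-like tensor, and inflate() inserts zeros between coefficients (roughly the dual of striding). A minimal host-side sketch; the dimension conventions follow the new tests and the sizes are arbitrary:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 4> t(2, 3, 5, 7);  // ColMajor: (depth, rows, cols, batch)
      t.setRandom();
      // 3x5 windows; the ColMajor result is (depth, 3, 5, #patches, batch).
      Eigen::Tensor<float, 5> patches = t.extract_image_patches(3, 5);
      // Stride 2 along rows and cols: one zero between neighbouring
      // coefficients, so a dimension of size d becomes (d - 1) * 2 + 1.
      Eigen::array<ptrdiff_t, 4> strides{{1, 2, 2, 1}};
      Eigen::Tensor<float, 4> inflated = t.inflate(strides);
      return (patches.dimension(4) == 7 && inflated.dimension(1) == 5) ? 0 : 1;
    }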
--- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 26 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 6 + .../TensorSyclConvertToDeviceExpression.h | 14 + .../src/Tensor/TensorSyclExprConstructor.h | 18 + .../src/Tensor/TensorSyclExtractAccessor.h | 14 + .../src/Tensor/TensorSyclExtractFunctors.h | 36 +- .../CXX11/src/Tensor/TensorSyclLeafCount.h | 10 + .../src/Tensor/TensorSyclPlaceHolderExpr.h | 14 + unsupported/test/CMakeLists.txt | 2 + .../test/cxx11_tensor_image_patchOP_sycl.cpp | 1092 +++++++++++++++++ .../test/cxx11_tensor_inflation_sycl.cpp | 136 ++ 11 files changed, 1356 insertions(+), 12 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_inflation_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 566856ed2..2fb6b84b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -70,12 +70,8 @@ class TensorImagePatchOp : public TensorBase, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), m_op(op) { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -241,6 +238,8 @@ struct TensorEvaluator, Device> break; default: eigen_assert(false && "unexpected padding"); + m_outputCols=0; // silence the uninitialised warnig; + m_outputRows=0; //// silence the uninitialised warnig; } } eigen_assert(m_outputRows > 0); @@ -420,7 +419,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + // required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } + Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -501,6 +503,8 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; TensorEvaluator m_impl; + // required for sycl + const XprType& m_op; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index f391fb9ee..b6bf05fed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -215,6 +215,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } + + protected: Dimensions m_dimensions; array m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index ff5097141..5b4a9af9f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -165,6 +165,20 @@ KERNELBROKERCONVERTCHIPPINGOP() + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp +#define 
KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ +template\ +struct ConvertToDeviceExpression >{\ + typedef CVQual TensorImagePatchOp::Type> Type;\ +}; +KERNELBROKERCONVERTIMAGEPATCHOP(const) +KERNELBROKERCONVERTIMAGEPATCHOP() +#undef KERNELBROKERCONVERTIMAGEPATCHOP + + + + } // namespace internal } // namespace TensorSycl } // namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 6b6093fa3..57a10d06b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -385,6 +385,24 @@ SYCLTENSORCHIPPINGOPEXPR(const) SYCLTENSORCHIPPINGOPEXPR() #undef SYCLTENSORCHIPPINGOPEXPR +// TensorImagePatchOp +#define SYCLTENSORIMAGEPATCHOPEXPR(CVQual)\ +template\ +struct ExprConstructor, CVQual TensorImagePatchOp, Params... > {\ + typedef ExprConstructor my_xpr_type;\ + typedef CVQual TensorImagePatchOp Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\ + funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ + funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_value, funcD.m_padding_type, funcD.m_padding_explicit){}\ +}; + +SYCLTENSORIMAGEPATCHOPEXPR(const) +SYCLTENSORIMAGEPATCHOPEXPR() +#undef SYCLTENSORIMAGEPATCHOPEXPR // TensorLayoutSwapOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 213dd25ea..2be6f3710 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -226,6 +226,20 @@ SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorImagePatchOp. +#define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\ +template\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORIMAGEPATCHOPEXTACC(const) +SYCLTENSORIMAGEPATCHOPEXTACC() +#undef SYCLTENSORIMAGEPATCHOPEXTACC + + // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorLayoutSwapOp. 
#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 1506e8189..dbac01138 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -296,7 +296,7 @@ SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) //TensorChippingOp #define SYCLEXTRFUNCCHIPPINGOP(CVQual)\ template\ -struct FunctorExtractor, Device>>{\ +struct FunctorExtractor, Device> >{\ FunctorExtractor > xprExpr;\ const DenseIndex m_dim;\ const DenseIndex m_offset;\ @@ -310,6 +310,40 @@ SYCLEXTRFUNCCHIPPINGOP(const) SYCLEXTRFUNCCHIPPINGOP() #undef SYCLEXTRFUNCCHIPPINGOP +#define SYCLEXTRFUNCIMAGEPATCHOP(CVQual)\ +template\ +struct FunctorExtractor, Device> >{\ +typedef CVQual TensorImagePatchOp Self;\ +FunctorExtractor > xprExpr;\ +const DenseIndex m_patch_rows;\ +const DenseIndex m_patch_cols;\ +const DenseIndex m_row_strides;\ +const DenseIndex m_col_strides;\ +const DenseIndex m_in_row_strides;\ +const DenseIndex m_in_col_strides;\ +const DenseIndex m_row_inflate_strides;\ +const DenseIndex m_col_inflate_strides;\ +const bool m_padding_explicit;\ +const DenseIndex m_padding_top;\ +const DenseIndex m_padding_bottom;\ +const DenseIndex m_padding_left;\ +const DenseIndex m_padding_right;\ +const PaddingType m_padding_type;\ +const typename Self::Scalar m_padding_value;\ +FunctorExtractor(const TensorEvaluator& expr)\ +: xprExpr(expr.impl()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\ + m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\ + m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\ + m_row_inflate_strides(expr.xpr().row_inflate_strides()), m_col_inflate_strides(expr.xpr().col_inflate_strides()),\ + m_padding_explicit(expr.xpr().padding_explicit()),m_padding_top(expr.xpr().padding_top()),\ + m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\ + m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),\ + m_padding_value(expr.xpr().padding_value()){}\ +}; + +SYCLEXTRFUNCIMAGEPATCHOP(const) +SYCLEXTRFUNCIMAGEPATCHOP() +#undef SYCLEXTRFUNCIMAGEPATCHOP /// template deduction function for FunctorExtractor template auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 15729310d..b8e658824 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -161,6 +161,16 @@ SLICESTRIDEOPLEAFCOUNT() #undef SLICESTRIDEOPLEAFCOUNT +#define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\ +template\ +struct LeafCount >:CategoryCount{}; + + +TENSORIMAGEPATCHOPLEAFCOUNT(const) +TENSORIMAGEPATCHOPLEAFCOUNT() +#undef TENSORIMAGEPATCHOPLEAFCOUNT + + } /// namespace TensorSycl } /// namespace internal } /// namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index ba0d17e0c..ab97235ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -221,6 +221,20 @@ SYCLSLICESTRIDEOPPLH() #undef SYCLSLICESTRIDEOPPLH + +/// specialisation of 
the \ref PlaceHolderExpression when the node is +/// TensorImagePatchOp +#define SYCLTENSORIMAGEPATCHOP(CVQual)\ +template\ +struct PlaceHolderExpression, N> {\ + typedef CVQual TensorImagePatchOp::ArgType> Type;\ +}; + +SYCLTENSORIMAGEPATCHOP(const) +SYCLTENSORIMAGEPATCHOP() +#undef SYCLTENSORIMAGEPATCHOP + + /// template deduction for \ref PlaceHolderExpression struct template struct createPlaceHolderExpression { diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 57580f805..282f9eb55 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -169,6 +169,8 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp new file mode 100644 index 000000000..ba6b2f15a --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp @@ -0,0 +1,1092 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
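// Editorial note (not part of the original patch): for ColMajor input the
// image patch result is laid out as (depth, patch_rows, patch_cols, #patches,
// batch), and RowMajor reverses this to (batch, #patches, patch_cols,
// patch_rows, depth); the dimension checks below exercise exactly this
// convention.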
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_image_patchOP_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template +static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Single pixel patch: ColMajor + array patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}}; + Tensor single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7); + + // Single pixel patch: RowMajor + array patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5); + 
VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index colmajor " << i << " : " + << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] + << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index row major" << i << " : " + << tensor_row_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}}; + Tensor entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7); + + // Entire image patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; + Tensor entire_image_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); + gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) 
{ + expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b); + expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(b, patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}}; + Tensor twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2); + + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + } + if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major); + + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) { + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + + } + if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); + +} + + +// Verifies VALID padding (no padding) with incrementing values. +template +static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 3; + IndexType input_cols = 3; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
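
With PADDING_VALID the kernel window must fit entirely inside the input, so the number of patch origins per dimension is (input - ksize) / stride + 1. For the 3x3 spatial input above, with ksize 2 and stride 2, that is a single origin per dimension, which is why the result tensor is expected to hold exactly one patch. A minimal check of that arithmetic (this is the usual VALID-padding rule, stated here as an assumption rather than quoted from Eigen):

#include <cassert>

long valid_patch_count(long input, long ksize, long stride) {
  return (input - ksize) / stride + 1;  // kernel window must stay in bounds
}

int main() {
  // 3x3 input, 2x2 kernel, stride 2 -> 1 origin per dimension, 1 patch total,
  // matching VERIFY_IS_EQUAL(result_col_major.dimension(3), 1) below.
  assert(valid_patch_count(3, 2, 2) == 1);
  return 0;
}
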
+ + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + // ColMajor + array patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}}; + Tensor result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_result_col_major); + sycl_device.deallocate(gpu_data_result_row_major); +} + +// Verifies VALID padding (no padding) with the same value. +template +static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 1; + IndexType input_rows = 5; + IndexType input_cols = 5; + IndexType input_batches = 2; + IndexType ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
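
Here the input is 5x5 with a 3x3 kernel and stride 2, so the same VALID-padding rule gives (5 - 3) / 2 + 1 = 2 origins per dimension (at offsets 0 and 2), hence a 2x2 grid of 4 patches, which the dimension checks below assert. A compile-time restatement of that arithmetic (an illustration, not part of the patch):

// Patch origins per dimension for a 5x5 input, 3x3 kernel, stride 2.
static_assert((5 - 3) / 2 + 1 == 2, "origins at row/col offsets 0 and 2");
// Total patches expected in result_col_major.dimension(3).
static_assert(((5 - 3) / 2 + 1) * ((5 - 3) / 2 + 1) == 4, "2 x 2 patch grid");
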
+ // ColMajor + + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + array patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}}; + Tensor result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + +// Verifies SAME padding. +template +static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 4; + IndexType input_cols = 2; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. + + // ColMajor + array tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. 
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + +array patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}}; +Tensor result_col_major(patchColMajorTensorRange); +size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); +DataType* gpu_data_result_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); +TensorMap> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); +gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); +sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + + array patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }}; + Tensor result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + + +template +static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + + // ColMajor + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}}; + Tensor tensor_col_major(tensorColMajorRange); + tensor_col_major.setRandom(); + Tensor tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0)); + + + // Single pixel patch: ColMajor + array patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}}; + Tensor single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 
1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3); + + // Single pixel patch: RowMajor + array patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " + << tensor_col_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}}; + Tensor entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + + // Entire image patch: RowMajor +patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; +Tensor entire_image_patch_row_major(patchRowMajorTensorRange); +patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); +DataType* gpu_data_entire_image_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); +TensorMap> 
gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); +gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); +sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = tensor_col_major(d, r-1+i, c-2+j); + expected_row_major = tensor_row_major(c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}}; + Tensor twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + + // Based on the calculation described in 
TensorTraits.h, padding happens to be 0. + IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset); + } + if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) { + expected_row_major = tensor_row_major(col_offset, row_offset, d); + } + if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); +} + +template +static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + // ColMajor + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 128; + IndexType sizeDim3 = 128; + IndexType sizeDim4 = 16; + array tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor l_in_col_major(tensorColMajorRange); + l_in_col_major.setRandom(); + + DataType* gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + + array patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}}; + Tensor l_out_col_major(patchTensorRange); + size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange); + gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1); + 
VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4); + + // RowMajor + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}}; + Tensor l_out_row_major(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1); + + for (IndexType b = 0; b < 16; ++b) { + for (IndexType i = 0; i < 128; ++i) { + for (IndexType j = 0; j < 128; ++j) { + int patchId = i+128*j; + for (IndexType c = 0; c < 11; ++c) { + for (IndexType r = 0; r < 11; ++r) { + for (IndexType d = 0; d < 3; ++d) { + DataType expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in_col_major(d, r-5+i, c-5+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != + expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), + expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 16; + sizeDim2 = 64; + sizeDim3 = 64; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64); + 
VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + +// RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 64; ++i) { + for (IndexType j = 0; j < 64; ++j) { + int patchId = i+64*j; + for (IndexType c = 0; c < 9; ++c) { + for (IndexType r = 0; r < 9; ++r) { + for (IndexType d = 0; d < 16; ++d) { + DataType expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in_col_major(d, r-4+i, c-4+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 32; + sizeDim2 = 16; + sizeDim3 = 16; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}}; + 
l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 16; ++i) { + for (IndexType j = 0; j < 16; ++j) { + int patchId = i+16*j; + for (IndexType c = 0; c < 7; ++c) { + for (IndexType r = 0; r < 7; ++r) { + for (IndexType d = 0; d < 32; ++d) { + DataType expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in_col_major(d, r-3+i, c-3+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 64; + sizeDim2 = 13; + sizeDim3 = 13; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + 
TensorMap>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 13; ++i) { + for (IndexType j = 0; j < 13; ++j) { + int patchId = i+13*j; + for (IndexType c = 0; c < 3; ++c) { + for (IndexType r = 0; r < 3; ++r) { + for (IndexType d = 0; d < 64; ++d) { + DataType expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in_col_major(d, r-1+i, c-1+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sycl_device.deallocate(gpu_data_l_out_row_major); +} + + +template void sycl_tensor_image_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +test_simple_image_patch_sycl(sycl_device); +test_patch_padding_valid_sycl(sycl_device); +test_patch_padding_valid_same_value_sycl(sycl_device); +test_patch_padding_same_sycl(sycl_device); +test_patch_no_extra_dim_sycl(sycl_device); +test_imagenet_patches_sycl(sycl_device); +} +void test_cxx11_tensor_image_patchOP_sycl() +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_image_patch_test_per_device(device)); +} +} diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp new file mode 100644 index 000000000..f2f87f7ed --- /dev/null +++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp @@ -0,0 +1,136 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
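
The new test below exercises Tensor::inflate, the inverse of striding: each dimension of size d grows to (d - 1) * stride + 1, with the original elements placed stride apart and zeros in between. A small host-side reference for the 1-D case (inflate1d is a hypothetical helper written only to illustrate the layout; it is not an Eigen function, and it assumes a non-empty input):

#include <cstddef>
#include <vector>

std::vector<float> inflate1d(const std::vector<float>& in, std::size_t stride) {
  // Inflated length: (d - 1) * stride + 1, zero-filled by default.
  std::vector<float> out((in.size() - 1) * stride + 1, 0.0f);
  for (std::size_t i = 0; i < in.size(); ++i)
    out[i * stride] = in[i];              // originals land at stride multiples
  return out;
}

int main() {
  // inflate1d({4, 4, 4}, 3) yields {4, 0, 0, 4, 0, 0, 4}, the example given
  // in the comments of the test file below.
  return inflate1d({4.f, 4.f, 4.f}, 3).size() == 7 ? 0 : 1;
}
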
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_inflation_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation definition: for each dimension, the inflated size is
+// ((dim - 1) * strides[dim] + 1).
+
+// For a 1-dimensional vector of size 3 with values (4, 4, 4), an inflation
+// stride of 3 turns it into a tensor of size (2*3) + 1 = 7 with the values
+// (4, 0, 0, 4, 0, 0, 4).
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> no_stride(tensorRange);
+  tensor.setRandom();
+
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+  const size_t tensorBuffSize = tensor.size()*sizeof(DataType);
+  DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_stride = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_stride.device(sycl_device) = gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  IndexType inflatedSizeDim1 = 3;
+  IndexType inflatedSizeDim2 = 9;
+  IndexType inflatedSizeDim3 = 9;
+  IndexType inflatedSizeDim4 = 19;
+  array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+  const size_t inflatedTensorBuffSize = inflated.size()*sizeof(DataType);
+  DataType* gpu_data_inflated = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+  gpu_inflated.device(sycl_device) = gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+  VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+  VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+  VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+  for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+    for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+      for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+        for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+          if (i % strides[0] == 0 &&
+              j % strides[1] == 0 &&
+              k % strides[2] == 0 &&
+              l % strides[3] == 0) {
+            VERIFY_IS_EQUAL(inflated(i,j,k,l),
+                            tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3]));
+          } else {
+            VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
sycl_device.deallocate(gpu_data_no_stride); + sycl_device.deallocate(gpu_data_inflated); +} + +template void sycl_inflation_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_inflation_sycl(sycl_device); + test_simple_inflation_sycl(sycl_device); +} +void test_cxx11_tensor_inflation_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_inflation_test_per_device(device)); + } +} From 4f07ac16b0722597c55e2783cee33606a1f5e390 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 21 Feb 2017 10:09:47 +0000 Subject: [PATCH 026/680] Reducing the number of warnings. --- .../test/cxx11_tensor_image_patchOP_sycl.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp index ba6b2f15a..e5ca4e388 100644 --- a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp +++ b/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp @@ -134,7 +134,7 @@ static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType i = 0; i < 3; ++i) { for (IndexType j = 0; j < 5; ++j) { - int patchId = i+3*j; + IndexType patchId = i+3*j; for (IndexType r = 0; r < 3; ++r) { for (IndexType c = 0; c < 5; ++c) { for (IndexType d = 0; d < 2; ++d) { @@ -206,7 +206,7 @@ static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType i = 0; i < 3; ++i) { for (IndexType j = 0; j < 5; ++j) { - int patchId = i+3*j; + IndexType patchId = i+3*j; for (IndexType r = 0; r < 2; ++r) { for (IndexType c = 0; c < 2; ++c) { for (IndexType d = 0; d < 2; ++d) { @@ -323,7 +323,7 @@ static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){ for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols - int patchId = i+input_rows*j; + IndexType patchId = i+input_rows*j; for (IndexType r = 0; r < ksize; ++r) { // patch rows for (IndexType c = 0; c < ksize; ++c) { // patch cols for (IndexType d = 0; d < input_depth; ++d) { // depth @@ -529,7 +529,7 @@ sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_majo for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols - int patchId = i+input_rows*j; + IndexType patchId = i+input_rows*j; for (IndexType r = 0; r < ksize; ++r) { // patch rows for (IndexType c = 0; c < ksize; ++c) { // patch cols for (IndexType d = 0; d < input_depth; ++d) { // depth @@ -667,7 +667,7 @@ sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_ent for (IndexType i = 0; i < 3; ++i) { for (IndexType j = 0; j < 5; ++j) { - int patchId = i+3*j; + IndexType patchId = i+3*j; for (IndexType r = 0; r < 3; ++r) { for (IndexType c = 0; c < 5; ++c) { for (IndexType d = 0; d < 2; ++d) { @@ -731,7 +731,7 @@ sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_ent for (IndexType i = 0; i < 3; ++i) { for (IndexType j = 0; j < 5; ++j) { - int patchId = i+3*j; + IndexType patchId = i+3*j; for (IndexType r = 0; r < 2; ++r) { for (IndexType c = 0; c < 2; ++c) { for (IndexType d = 0; d < 2; ++d) { @@ -824,7 +824,7 @@ static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType b = 0; b < 16; 
++b) { for (IndexType i = 0; i < 128; ++i) { for (IndexType j = 0; j < 128; ++j) { - int patchId = i+128*j; + IndexType patchId = i+128*j; for (IndexType c = 0; c < 11; ++c) { for (IndexType r = 0; r < 11; ++r) { for (IndexType d = 0; d < 3; ++d) { @@ -899,7 +899,7 @@ static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType b = 0; b < 32; ++b) { for (IndexType i = 0; i < 64; ++i) { for (IndexType j = 0; j < 64; ++j) { - int patchId = i+64*j; + IndexType patchId = i+64*j; for (IndexType c = 0; c < 9; ++c) { for (IndexType r = 0; r < 9; ++r) { for (IndexType d = 0; d < 16; ++d) { @@ -972,7 +972,7 @@ static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType b = 0; b < 32; ++b) { for (IndexType i = 0; i < 16; ++i) { for (IndexType j = 0; j < 16; ++j) { - int patchId = i+16*j; + IndexType patchId = i+16*j; for (IndexType c = 0; c < 7; ++c) { for (IndexType r = 0; r < 7; ++r) { for (IndexType d = 0; d < 32; ++d) { @@ -1044,7 +1044,7 @@ static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType b = 0; b < 32; ++b) { for (IndexType i = 0; i < 13; ++i) { for (IndexType j = 0; j < 13; ++j) { - int patchId = i+13*j; + IndexType patchId = i+13*j; for (IndexType c = 0; c < 3; ++c) { for (IndexType r = 0; r < 3; ++r) { for (IndexType d = 0; d < 64; ++d) { From 89dfd51fae868393b66b1949638e03de2ba17c1f Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 22 Feb 2017 16:36:24 +0000 Subject: [PATCH 027/680] Adding Sycl Backend for TensorGenerator.h. --- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 13 +- unsupported/test/CMakeLists.txt | 1 + .../test/cxx11_tensor_generator_sycl.cpp | 147 ++++++++++++++++++ 3 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_generator_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index eb1d4934e..ca87f493a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -97,10 +97,9 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()) + : m_generator(op.generator()), m_argImpl(op.expression(), device) { - TensorEvaluator impl(op.expression(), device); - m_dimensions = impl.dimensions(); + m_dimensions = m_argImpl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; @@ -155,6 +154,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_argImpl; } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; } + + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { @@ -178,6 +183,8 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; array m_strides; Generator m_generator; + // required by sycl + TensorEvaluator m_argImpl; }; } // end namespace Eigen diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 282f9eb55..69c892362 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -171,6 +171,7 @@ if(EIGEN_TEST_CXX11) 
ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11")
   ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11")
   ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11")
+  ei_add_test_sycl(cxx11_tensor_generator_sycl "-std=c++11")
 endif(EIGEN_TEST_SYCL)
 # It should be safe to always run these tests as there is some fallback code for
 # older compiler that don't support cxx11.
diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp
new file mode 100644
index 000000000..f551c8d0c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact:
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_generator_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+static const float error_threshold =1e-8f;
+
+#include "main.h"
+#include
+
+using Eigen::Tensor;
+struct Generator1D {
+  Generator1D() { }
+
+  float operator()(const array& coordinates) const {
+    return coordinates[0];
+  }
+};
+
+template
+static void test_1D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 6;
+  array tensorRange = {{sizeDim1}};
+  Tensor vec(tensorRange);
+  Tensor result(tensorRange);
+
+  const size_t tensorBuffSize =vec.size()*sizeof(DataType);
+  DataType* gpu_data_vec = static_cast(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap> gpu_vec(gpu_data_vec, tensorRange);
+  TensorMap> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_vec.generate(Generator1D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+
+struct Generator2D {
+  Generator2D() { }
+
+  float operator()(const array& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];
+  }
+};
+
+template
+static void test_2D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 7;
+  array tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor matrix(tensorRange);
+  Tensor result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix = static_cast(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(Generator2D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < 5; ++i) {
+    for (IndexType j = 0; j < 7; ++j) {
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+template
+static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType
rows = 32;
+  IndexType cols = 48;
+  array means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator gaussian_gen(means, std_devs);
+
+  array tensorRange = {{rows, cols}};
+  Tensor matrix(tensorRange);
+  Tensor result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix = static_cast(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(gaussian_gen);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < rows; ++i) {
+    for (IndexType j = 0; j < cols; ++j) {
+      DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      DataType gaussian = expf(-g_rows - g_cols);
+      VERIFY(Eigen::internal::isApprox(result(i, j), gaussian, error_threshold));
+    }
+  }
+}
+
+template void sycl_generator_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_1D_sycl(sycl_device);
+  test_1D_sycl(sycl_device);
+  test_2D_sycl(sycl_device);
+  test_2D_sycl(sycl_device);
+  test_gaussian_sycl(sycl_device);
+  test_gaussian_sycl(sycl_device);
+}
+void test_cxx11_tensor_generator_sycl()
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_generator_test_per_device(device));
+  }
+}

From 0b7875f1376a0f3f22754837712ddd885ca3f4dd Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Fri, 24 Feb 2017 18:13:30 +0000
Subject: [PATCH 028/680] Converting fixed float type into template type for TensorContraction.
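The SYCL contraction kernel declared its per-thread accumulation registers as float regardless of the tensors' scalar types, forcing float precision onto every non-float contraction. A minimal sketch of the register-typing pattern this change applies (the LhsScalar/RhsScalar/OutScalar names are taken from the diff below; the struct itself is illustrative, not the actual kernel):

  // Per-thread register tile typed by the expression's scalar types
  // instead of hard-coded float.
  template <typename LhsScalar, typename RhsScalar, typename OutScalar,
            int WorkLoadPerThreadM, int WorkLoadPerThreadN>
  struct RegisterTile {
    LhsScalar privateLhs;                                          // was: float
    RhsScalar privateRhs[WorkLoadPerThreadN];                      // was: float
    OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];  // was: float
    void reset() {
      for (int m = 0; m < WorkLoadPerThreadM; ++m)
        for (int n = 0; n < WorkLoadPerThreadN; ++n)
          privateRes[m][n] = static_cast<OutScalar>(0);            // was: 0.0f
    }
  };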
--- .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index e87de0c57..abc7ba551 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -230,13 +230,13 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS const Index nGroupId = itemID.get_group(1); // Work-group ID localCol const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID // Allocate register space - float privateLhs; - float privateRhs[WorkLoadPerThreadN]; - float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; + LhsScalar privateLhs; + RhsScalar privateRhs[WorkLoadPerThreadN]; + OutScalar privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; // Initialise the privateResumulation registers for (Index wLPTM=0; wLPTM(0); } } From 2fa2b617a97ba254343c7c1635a9b6d617a100e8 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 24 Feb 2017 19:16:24 +0000 Subject: [PATCH 029/680] Adding TensorVolumePatchOP.h for sycl --- .../TensorSyclConvertToDeviceExpression.h | 10 + .../src/Tensor/TensorSyclExprConstructor.h | 22 ++ .../src/Tensor/TensorSyclExtractAccessor.h | 15 ++ .../src/Tensor/TensorSyclExtractFunctors.h | 44 ++++ .../CXX11/src/Tensor/TensorSyclLeafCount.h | 13 +- .../src/Tensor/TensorSyclPlaceHolderExpr.h | 14 ++ .../CXX11/src/Tensor/TensorVolumePatch.h | 30 ++- unsupported/test/CMakeLists.txt | 5 +- .../test/cxx11_tensor_volume_patchOP_sycl.cpp | 222 ++++++++++++++++++ 9 files changed, 359 insertions(+), 16 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 5b4a9af9f..dd63a2e2f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -177,6 +177,16 @@ KERNELBROKERCONVERTIMAGEPATCHOP() #undef KERNELBROKERCONVERTIMAGEPATCHOP +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorVolumePatchOp +#define KERNELBROKERCONVERTVOLUMEPATCHOP(CVQual)\ +template\ +struct ConvertToDeviceExpression >{\ + typedef CVQual TensorVolumePatchOp::Type> Type;\ +}; +KERNELBROKERCONVERTVOLUMEPATCHOP(const) +KERNELBROKERCONVERTVOLUMEPATCHOP() +#undef KERNELBROKERCONVERTVOLUMEPATCHOP + } // namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 57a10d06b..117b368ec 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -404,6 +404,28 @@ SYCLTENSORIMAGEPATCHOPEXPR(const) SYCLTENSORIMAGEPATCHOPEXPR() #undef SYCLTENSORIMAGEPATCHOPEXPR +// TensorVolumePatchOp +#define SYCLTENSORVOLUMEPATCHOPEXPR(CVQual)\ +template\ +struct ExprConstructor, CVQual TensorVolumePatchOp, Params... 
> {\ + typedef ExprConstructor my_xpr_type;\ + typedef CVQual TensorVolumePatchOp Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\ + funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ + funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_value,\ + funcD.m_padding_type, funcD.m_padding_explicit){\ + }\ +}; + +SYCLTENSORVOLUMEPATCHOPEXPR(const) +SYCLTENSORVOLUMEPATCHOPEXPR() +#undef SYCLTENSORVOLUMEPATCHOPEXPR + + // TensorLayoutSwapOp #define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 2be6f3710..4a6322d44 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -240,6 +240,21 @@ SYCLTENSORIMAGEPATCHOPEXTACC() #undef SYCLTENSORIMAGEPATCHOPEXTACC + +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorVolumePatchOp. +#define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\ +template\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORVOLUMEPATCHOPEXTACC(const) +SYCLTENSORVOLUMEPATCHOPEXTACC() +#undef SYCLTENSORVOLUMEPATCHOPEXTACC + + // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorLayoutSwapOp. 
#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index dbac01138..8828a0495 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -344,6 +344,50 @@ FunctorExtractor(const TensorEvaluator& expr)\ SYCLEXTRFUNCIMAGEPATCHOP(const) SYCLEXTRFUNCIMAGEPATCHOP() #undef SYCLEXTRFUNCIMAGEPATCHOP + +/// TensorVolumePatchOp +#define SYCLEXTRFUNCVOLUMEPATCHOP(CVQual)\ +template\ +struct FunctorExtractor, Device> >{\ +typedef CVQual TensorVolumePatchOp Self;\ +FunctorExtractor > xprExpr;\ +const DenseIndex m_patch_planes;\ +const DenseIndex m_patch_rows;\ +const DenseIndex m_patch_cols;\ +const DenseIndex m_plane_strides;\ +const DenseIndex m_row_strides;\ +const DenseIndex m_col_strides;\ +const DenseIndex m_in_plane_strides;\ +const DenseIndex m_in_row_strides;\ +const DenseIndex m_in_col_strides;\ +const DenseIndex m_plane_inflate_strides;\ +const DenseIndex m_row_inflate_strides;\ +const DenseIndex m_col_inflate_strides;\ +const bool m_padding_explicit;\ +const DenseIndex m_padding_top_z;\ +const DenseIndex m_padding_bottom_z;\ +const DenseIndex m_padding_top;\ +const DenseIndex m_padding_bottom;\ +const DenseIndex m_padding_left;\ +const DenseIndex m_padding_right;\ +const PaddingType m_padding_type;\ +const typename Self::Scalar m_padding_value;\ +FunctorExtractor(const TensorEvaluator& expr)\ +: xprExpr(expr.impl()), m_patch_planes(expr.xpr().patch_planes()), m_patch_rows(expr.xpr().patch_rows()), m_patch_cols(expr.xpr().patch_cols()),\ + m_plane_strides(expr.xpr().plane_strides()), m_row_strides(expr.xpr().row_strides()), m_col_strides(expr.xpr().col_strides()),\ + m_in_plane_strides(expr.xpr().in_plane_strides()), m_in_row_strides(expr.xpr().in_row_strides()), m_in_col_strides(expr.xpr().in_col_strides()),\ + m_plane_inflate_strides(expr.xpr().plane_inflate_strides()),m_row_inflate_strides(expr.xpr().row_inflate_strides()),\ + m_col_inflate_strides(expr.xpr().col_inflate_strides()), m_padding_explicit(expr.xpr().padding_explicit()),\ + m_padding_top_z(expr.xpr().padding_top_z()), m_padding_bottom_z(expr.xpr().padding_bottom_z()), \ + m_padding_top(expr.xpr().padding_top()), m_padding_bottom(expr.xpr().padding_bottom()), m_padding_left(expr.xpr().padding_left()),\ + m_padding_right(expr.xpr().padding_right()), m_padding_type(expr.xpr().padding_type()),m_padding_value(expr.xpr().padding_value()){}\ +}; +SYCLEXTRFUNCVOLUMEPATCHOP(const) +SYCLEXTRFUNCVOLUMEPATCHOP() +#undef SYCLEXTRFUNCVOLUMEPATCHOP + + + /// template deduction function for FunctorExtractor template auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index b8e658824..50f4595fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -151,7 +151,7 @@ CHIPPINGOPLEAFCOUNT(const) CHIPPINGOPLEAFCOUNT() #undef CHIPPINGOPLEAFCOUNT - +///TensorStridingSlicingOp #define SLICESTRIDEOPLEAFCOUNT(CVQual)\ template\ struct LeafCount >:CategoryCount{}; @@ -160,7 +160,7 @@ SLICESTRIDEOPLEAFCOUNT(const) SLICESTRIDEOPLEAFCOUNT() #undef SLICESTRIDEOPLEAFCOUNT - +//TensorImagePatchOp #define TENSORIMAGEPATCHOPLEAFCOUNT(CVQual)\ template\ struct LeafCount >:CategoryCount{}; @@ -170,6 +170,15 @@ 
TENSORIMAGEPATCHOPLEAFCOUNT(const) TENSORIMAGEPATCHOPLEAFCOUNT() #undef TENSORIMAGEPATCHOPLEAFCOUNT +// TensorVolumePatchOp +#define TENSORVOLUMEPATCHOPLEAFCOUNT(CVQual)\ +template\ +struct LeafCount >:CategoryCount{}; + + +TENSORVOLUMEPATCHOPLEAFCOUNT(const) +TENSORVOLUMEPATCHOPLEAFCOUNT() +#undef TENSORVOLUMEPATCHOPLEAFCOUNT } /// namespace TensorSycl } /// namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index ab97235ae..fcef0be04 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -235,6 +235,20 @@ SYCLTENSORIMAGEPATCHOP() #undef SYCLTENSORIMAGEPATCHOP + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorVolumePatchOp +#define SYCLTENSORVOLUMEPATCHOP(CVQual)\ +template\ +struct PlaceHolderExpression, N> {\ + typedef CVQual TensorVolumePatchOp::ArgType> Type;\ +}; + +SYCLTENSORVOLUMEPATCHOP(const) +SYCLTENSORVOLUMEPATCHOP() +#undef SYCLTENSORVOLUMEPATCHOP + + /// template deduction for \ref PlaceHolderExpression struct template struct createPlaceHolderExpression { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 0ca2cac84..64474ee80 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -65,12 +65,8 @@ class TensorVolumePatchOp : public TensorBase, D CoordAccess = false, RawAccess = false }; +#ifdef __SYCL_DEVICE_ONLY__ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) +#else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) +#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), m_op(op) { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -321,7 +321,9 @@ struct TensorEvaluator, D m_outputPlanesRows = m_outputPlanes * m_outputRows; // Fast representations of different variables. + // printf("THis is m_otherStride: %lu\n", m_otherStride ); m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); m_fastColStride = internal::TensorIntDivisor(m_colStride); m_fastRowStride = internal::TensorIntDivisor(m_rowStride); @@ -338,7 +340,6 @@ struct TensorEvaluator, D m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { @@ -352,6 +353,7 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + // Patch index corresponding to the passed in index. 
const Index patchIndex = index / m_fastPatchStride; @@ -505,6 +507,8 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } + // required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } Index planePaddingTop() const { return m_planePaddingTop; } Index rowPaddingTop() const { return m_rowPaddingTop; } @@ -600,6 +604,8 @@ struct TensorEvaluator, D Scalar m_paddingValue; TensorEvaluator m_impl; +// required by sycl + XprType m_op; }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 69c892362..508f29446 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -167,11 +167,12 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_generator_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp new file mode 100644 index 000000000..ddc9e0d46 --- /dev/null +++ b/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp @@ -0,0 +1,222 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_volume_patchOP_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template +static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + +IndexType sizeDim0 = 4; +IndexType sizeDim1 = 2; +IndexType sizeDim2 = 3; +IndexType sizeDim3 = 5; +IndexType sizeDim4 = 7; +array tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; +array tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}}; +Tensor tensor_col_major(tensorColMajorRange); +Tensor tensor_row_major(tensorRowMajorRange); +tensor_col_major.setRandom(); + + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + + + // single volume patch: ColMajor + array patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}}; + Tensor single_voxel_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_voxel_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange); + gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1); + sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7); + + array patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}}; + Tensor single_voxel_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_voxel_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange); + gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1); + sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1); + 
VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4); + + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]); + VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + } + + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_voxel_patch_col_major); + sycl_device.deallocate(gpu_data_single_voxel_patch_row_major); +} + +template +static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + const int depth = 4; + const int patch_z = 2; + const int patch_y = 3; + const int patch_x = 5; + const int batch = 7; + + array tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}}; + array tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}}; + Tensor tensor_col_major(tensorColMajorRange); + Tensor tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + + DataType* gpu_data_col_major = static_cast(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + + // single volume patch: ColMajor + array patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}}; + Tensor entire_volume_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_volume_patch_col_major = static_cast(sycl_device.allocate(patchTensorBuffSize)); + TensorMap> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange); + gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x); + sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize); + + +// Tensor tensor(depth, patch_z, patch_y, patch_x, batch); +// tensor.setRandom(); +// Tensor tensor_row_major = tensor.swap_layout(); + + //Tensor entire_volume_patch; + //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x); + VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch); + +// Tensor entire_volume_patch_row_major; + //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x); + + array 
patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}};
+  Tensor entire_volume_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_volume_patch_row_major = static_cast(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz - dz / 2;
+  const int forward_pad_y = dy - dy / 2;
+  const int forward_pad_x = dx - dx / 2;
+
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor_col_major(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_row_major);
+}
+
+
+
+template void sycl_tensor_volume_patch_test_per_device(dev_Selector s){
+QueueInterface queueInterface(s);
+auto sycl_device = Eigen::SyclDevice(&queueInterface);
+std::cout << "Running on " << s.template get_info() << std::endl;
+test_single_voxel_patch_sycl(sycl_device);
+test_entire_volume_patch_sycl(sycl_device);
+}
+void test_cxx11_tensor_volume_patchOP_sycl()
+{
+for (const auto& device :Eigen::get_sycl_supported_devices()) {
+  CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device(device));
+}
+}

From 34d9fce93bd5f1521017402154a8ab915af8fcb8 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 27 Feb 2017 16:33:33 -0800
Subject: [PATCH 030/680] Avoid unnecessary float to double conversions.
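The temporary in the CUDA ptranspose specialization touched below (its PacketBlock template arguments are elided in this diff, but the exchanged lanes are float) was declared double, so every swap widened a float to double and narrowed it back. A hedged illustration with a stand-in type, since only the temporary's type matters here:

  // Float4 is a stand-in for CUDA's float4; hypothetical helper, not Eigen API.
  struct Float4 { float x, y, z, w; };
  inline void swap_lanes(Float4& p0, Float4& p1) {
    float tmp = p0.y;  // with 'double tmp', this round-tripped float->double->float
    p0.y = p1.x;
    p1.x = tmp;
  }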
--- Eigen/src/Core/arch/CUDA/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index ad66399e0..4dda63188 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; + float tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; kernel.packet[1].x = tmp; From 554116bec1b19d417521e9bee767d6b57813492a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 16:45:31 -0800 Subject: [PATCH 031/680] Added EIGEN_DEVICE_FUNC to make the prototype of the EigenBase override match that of DenseBase --- Eigen/src/Core/EigenBase.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h index f76995af9..ccc122cfc 100644 --- a/Eigen/src/Core/EigenBase.h +++ b/Eigen/src/Core/EigenBase.h @@ -128,6 +128,7 @@ template struct EigenBase */ template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const EigenBase &other) { call_assignment(derived(), other.derived()); @@ -136,6 +137,7 @@ Derived& DenseBase::operator=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator+=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -144,6 +146,7 @@ Derived& DenseBase::operator+=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator-=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); From b1fc7c9a09e2aee938036ddba14e870c9658c791 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 16:48:30 -0800 Subject: [PATCH 032/680] Added missing EIGEN_DEVICE_FUNC qualifiers. 
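Without the qualifier, nvcc refuses to call these compound-assignment operators from device code. A minimal sketch of the usage this enables (a hypothetical CUDA kernel, not part of the patch):

  #include <Eigen/Core>
  // Fixed-size array arithmetic inside a kernel; every operator invoked here
  // must carry EIGEN_DEVICE_FUNC or nvcc reports a host-function call error.
  __global__ void accumulate(Eigen::Array3f* a, const Eigen::Array3f* b) {
    *a += *b;  // ArrayBase::operator+=
    *a *= *b;  // ArrayBase::operator*=
  }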
--- Eigen/src/Core/ArrayBase.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index af5fb2566..9da960f08 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -175,7 +175,7 @@ template class ArrayBase */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator-=(const ArrayBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -188,7 +188,7 @@ ArrayBase::operator-=(const ArrayBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator+=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -201,7 +201,7 @@ ArrayBase::operator+=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator*=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::mul_assign_op()); @@ -214,7 +214,7 @@ ArrayBase::operator*=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator/=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::div_assign_op()); From ed4dc9d01aa46688e12ef1e0772145c1b222602c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 16:57:01 -0800 Subject: [PATCH 033/680] Declared the plset, ploadt_ro, and ploaddup packet primitives as usable within a gpu kernel --- Eigen/src/Core/GenericPacketMath.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index ac5552d3e..d19d5bbd2 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -231,7 +231,7 @@ pload1(const typename unpacket_traits::type *a) { return pset1( * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * Currently, this function is only used for scalar * complex products. */ -template EIGEN_DEVICE_FUNC inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits::type* from) { return *from; } /** \internal \returns a packet with elements of \a *from quadrupled. @@ -279,7 +279,7 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ -template inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ @@ -487,7 +487,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro * by the current computation. */ template -inline Packet ploadt_ro(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits::type* from) { return ploadt(from); } From 193939d6aaca2d8b4ee7cac9f0a89637596c692f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 17:11:47 -0800 Subject: [PATCH 034/680] Added missing EIGEN_DEVICE_FUNC qualifiers to several nullary op methods. 
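This covers the Constant/Zero/Ones/LinSpaced/Identity/Unit family and their set* counterparts in CwiseNullaryOp.h. A hedged example of what the annotations make legal in device code (hypothetical kernel, not from this patch):

  #include <Eigen/Core>
  __global__ void init_in_kernel(Eigen::Matrix3f* m, Eigen::Vector4f* v) {
    *m = Eigen::Matrix3f::Identity();  // Identity() now usable on device
    m->setConstant(2.0f);              // setConstant() likewise
    *v = Eigen::Vector4f::UnitW();     // the Unit*() basis vectors too
  }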
--- Eigen/src/Core/CwiseNullaryOp.h | 80 ++++++++++++++++----------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index dd498f758..ddd607e38 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -150,7 +150,7 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(const CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -192,7 +192,7 @@ DenseBase::Constant(Index rows, Index cols, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index size, const Scalar& value) { return DenseBase::NullaryExpr(size, internal::scalar_constant_op(value)); @@ -208,7 +208,7 @@ DenseBase::Constant(Index size, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(const Scalar& value) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -220,7 +220,7 @@ DenseBase::Constant(const Scalar& value) * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -232,7 +232,7 @@ DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const * \sa LinSpaced(Scalar,Scalar) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -264,7 +264,7 @@ DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -276,7 +276,7 @@ DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) * Special version for fixed size types which does not require the size parameter. 
*/ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -286,7 +286,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ template -bool DenseBase::isApproxToConstant +EIGEN_DEVICE_FUNC bool DenseBase::isApproxToConstant (const Scalar& val, const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); @@ -301,7 +301,7 @@ bool DenseBase::isApproxToConstant * * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */ template -bool DenseBase::isConstant +EIGEN_DEVICE_FUNC bool DenseBase::isConstant (const Scalar& val, const RealScalar& prec) const { return isApproxToConstant(val, prec); @@ -312,7 +312,7 @@ bool DenseBase::isConstant * \sa setConstant(), Constant(), class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) { setConstant(val); } @@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) { return derived() = Constant(rows(), cols(), val); } @@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index size, const Scalar& val) { resize(size); @@ -356,7 +356,7 @@ PlainObjectBase::setConstant(Index size, const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) { resize(rows, cols); @@ -380,7 +380,7 @@ PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); @@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, con * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& 
high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return setLinSpaced(size(), low, high); @@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, * \sa Zero(), Zero(Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index rows, Index cols) { return Constant(rows, cols, Scalar(0)); @@ -446,7 +446,7 @@ DenseBase::Zero(Index rows, Index cols) * \sa Zero(), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index size) { return Constant(size, Scalar(0)); @@ -463,7 +463,7 @@ DenseBase::Zero(Index size) * \sa Zero(Index), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero() { return Constant(Scalar(0)); @@ -478,7 +478,7 @@ DenseBase::Zero() * \sa class CwiseNullaryOp, Zero() */ template -bool DenseBase::isZero(const RealScalar& prec) const +EIGEN_DEVICE_FUNC bool DenseBase::isZero(const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); for(Index j = 0; j < cols(); ++j) @@ -496,7 +496,7 @@ bool DenseBase::isZero(const RealScalar& prec) const * \sa class CwiseNullaryOp, Zero() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setZero() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setZero() { return setConstant(Scalar(0)); } @@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setZero() * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index newSize) { resize(newSize); @@ -529,7 +529,7 @@ PlainObjectBase::setZero(Index newSize) * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index rows, Index cols) { resize(rows, cols); @@ -553,7 +553,7 @@ PlainObjectBase::setZero(Index rows, Index cols) * \sa Ones(), Ones(Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index rows, Index cols) { return Constant(rows, cols, Scalar(1)); @@ -576,7 +576,7 @@ DenseBase::Ones(Index rows, Index cols) * \sa Ones(), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index newSize) { return Constant(newSize, Scalar(1)); @@ -593,7 +593,7 @@ DenseBase::Ones(Index newSize) * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones() { return Constant(Scalar(1)); @@ -608,7 +608,7 @@ DenseBase::Ones() * \sa class CwiseNullaryOp, Ones() */ template -bool DenseBase::isOnes +EIGEN_DEVICE_FUNC bool DenseBase::isOnes (const RealScalar& prec) const { return isApproxToConstant(Scalar(1), prec); @@ -622,7 +622,7 @@ bool DenseBase::isOnes * 
\sa class CwiseNullaryOp, Ones() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() { return setConstant(Scalar(1)); } @@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index newSize) { resize(newSize); @@ -655,7 +655,7 @@ PlainObjectBase::setOnes(Index newSize) * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index rows, Index cols) { resize(rows, cols); @@ -679,7 +679,7 @@ PlainObjectBase::setOnes(Index rows, Index cols) * \sa Identity(), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity(Index rows, Index cols) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_identity_op()); @@ -696,7 +696,7 @@ MatrixBase::Identity(Index rows, Index cols) * \sa Identity(Index,Index), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity() { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -771,7 +771,7 @@ struct setIdentity_impl * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() { return internal::setIdentity_impl::run(derived()); } @@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) { derived().resize(rows, cols); return setIdentity(); @@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i); @@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(),i); @@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), 
MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() { return Derived::Unit(0); } /** \returns an expression of the Y axis unit vector (0,1{,0}^*) @@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() { return Derived::Unit(1); } /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*) @@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() { return Derived::Unit(2); } /** \returns an expression of the W axis unit vector (0,0,0,1) @@ -858,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } } // end namespace Eigen From 889c606f8fd242b1cf5e3c8f967ac7dad004775d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 27 Feb 2017 17:17:47 -0800 Subject: [PATCH 035/680] Added missing EIGEN_DEVICE_FUNC to the SelfCwise binary ops --- Eigen/src/Core/SelfCwiseBinaryOp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h index 719ed72a5..50099df82 100644 --- a/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -15,7 +15,7 @@ namespace Eigen { // TODO generalize the scalar type of 'other' template -EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op()); @@ -23,7 +23,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op()); @@ -31,7 +31,7 @@ EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), 
PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op()); @@ -39,7 +39,7 @@ EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) } template -EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op()); From 478a9f53be33c23ac5e22e0bb09cad7f719fedd4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 28 Feb 2017 09:32:45 +0100 Subject: [PATCH 036/680] Fix typo. --- unsupported/Eigen/CXX11/src/Tensor/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index fbb7f3bfc..38cdb9c69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -75,16 +75,16 @@ large enough to hold all the data. // Map a tensor of ints on top of stack-allocated storage. int storage[128]; // 2 x 4 x 2 x 8 = 128 - TensorMap t_4d(storage, 2, 4, 2, 8); + TensorMap> t_4d(storage, 2, 4, 2, 8); // The same storage can be viewed as a different tensor. // You can also pass the sizes as an array. - TensorMap t_2d(storage, 16, 8); + TensorMap> t_2d(storage, 16, 8); // You can also map fixed-size tensors. Here we get a 1d view of // the 2d fixed-size tensor. Tensor> t_4x3; - TensorMap t_12(t_4x3, 12); + TensorMap> t_12(t_4x3, 12); #### Class TensorRef From 4e98a7b2f0501408751e2da5f24d65f642371226 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 28 Feb 2017 09:47:38 +0100 Subject: [PATCH 037/680] bug #1396: add some missing EIGEN_DEVICE_FUNC --- Eigen/src/Core/DenseBase.h | 6 +++--- Eigen/src/Core/MatrixBase.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index fc807577b..91a8511be 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -484,9 +484,9 @@ template class DenseBase return derived().coeff(0,0); } - bool all() const; - bool any() const; - Index count() const; + EIGEN_DEVICE_FUNC bool all() const; + EIGEN_DEVICE_FUNC bool any() const; + EIGEN_DEVICE_FUNC Index count() const; typedef VectorwiseOp RowwiseReturnType; typedef const VectorwiseOp ConstRowwiseReturnType; diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 675c94e12..200e57741 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -294,7 +294,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator!= */ template - inline bool operator==(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase& other) const { return cwiseEqual(other).all(); } /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. 
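What these qualifiers buy in practice: the comparison operators become callable from device code, and because operator== lowers to cwiseEqual(other).all(), the all()/any()/count() declarations patched in DenseBase.h above must be device functions as well. A minimal sketch of what now compiles (assuming a CUDA build; the kernel and its names are illustrative, not part of the patch):

    #include <Eigen/Core>

    __global__ void matrices_equal(const Eigen::Matrix3f* a, const Eigen::Matrix3f* b, bool* out)
    {
      // operator== expands to cwiseEqual(other).all(); both the operator and
      // all() now carry EIGEN_DEVICE_FUNC, so the comparison is legal here.
      *out = (*a == *b);
    }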
@@ -302,7 +302,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator== */ template - inline bool operator!=(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } NoAlias noalias(); From 8296b87d7bd98c19c6064241880691f164790ede Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 28 Feb 2017 17:16:14 +0000 Subject: [PATCH 038/680] Adding sycl backend for TensorCustomOp; fixing the partial lhs modification issue on sycl when the rhs is TensorContraction, reduction or convolution; Fixing the partial modification for memset when sycl backend is used. --- .../CXX11/src/Tensor/TensorContractionSycl.h | 21 +-- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 30 ++-- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 6 + .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 23 ++- .../CXX11/src/Tensor/TensorReductionSycl.h | 16 +- .../TensorSyclConvertToDeviceExpression.h | 1 + .../src/Tensor/TensorSyclExprConstructor.h | 61 +++++-- .../src/Tensor/TensorSyclExtractAccessor.h | 27 +++ .../src/Tensor/TensorSyclExtractFunctors.h | 43 ++++- .../CXX11/src/Tensor/TensorSyclFunctors.h | 21 ++- .../CXX11/src/Tensor/TensorSyclLeafCount.h | 20 +++ .../src/Tensor/TensorSyclPlaceHolderExpr.h | 27 +++ unsupported/test/CMakeLists.txt | 1 + .../test/cxx11_tensor_custom_op_sycl.cpp | 165 ++++++++++++++++++ .../test/cxx11_tensor_forced_eval_sycl.cpp | 2 +- 15 files changed, 397 insertions(+), 67 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_custom_op_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index abc7ba551..fcd7d4d00 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -84,7 +84,7 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { + if (data) { evalTo(data); return false; } else { @@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS LhsLocalAcc localLhs; RhsLocalAcc localRhs; OutAccessor out_res; + size_t out_offset; Index roundUpK, M, N, K; ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; LeftNocontractT m_i_strides, m_left_nocontract_strides; @@ -182,11 +183,12 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS Device dev; - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_, Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), + 
out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), m_right_contracting_strides(m_right_contracting_strides_), m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), @@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS for (Index wLPTN=0; wLPTN(cgh, self.left_impl())) LHSTupleType; @@ -379,17 +381,16 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo typedef cl::sycl::accessor RhsLocalAcc; RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - typedef cl::sycl::accessor OutAccessor; + typedef cl::sycl::accessor OutAccessor; //OutScalar memory - OutAccessor out_res= self.device(). template get_sycl_accessor(cgh, buffer); - + OutAccessor out_res= self.device(). template get_sycl_accessor(cgh, buffer); // sycl parallel for cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), KernelConstructor(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, + localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); }); self.device().asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 4247c1c4a..66ffd819f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -32,14 +32,15 @@ internal::IndexMapper::La Kernel_accessor kernel_filter; const size_t kernelSize, range_x, range_y; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel1D(internal::IndexMapper::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<2> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; @@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel2D(internal::IndexMapper::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, 
const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; @@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper::La Kernel_accessor kernel_filter; const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; Buffer_accessor buffer_acc; +ptrdiff_t out_offset; Local_accessor local_acc; FunctorExpr functors; TupleType tuple_of_accessors; EigenConvolutionKernel3D(internal::IndexMapper::Layout> indexMapper_, Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} void operator()(cl::sycl::nd_item<3> itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; @@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper EvalTo; EvalTo evalToTmp(local, m_kernelArg); @@ -325,6 +328,7 @@ struct TensorEvaluator InputFunctorExpr; // extract input functor list InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); + ptrdiff_t out_offset = m_device.get_offset(data); m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { @@ -358,7 +362,7 @@ struct TensorEvaluator(global_range, local_range), EigenConvolutionKernel1D( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -383,7 +387,7 @@ struct TensorEvaluator(global_range, local_range), EigenConvolutionKernel2D( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } @@ -412,7 +416,7 @@ struct TensorEvaluator( indexMapper,kernel_acc, kernel_size_x, 
kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors)); break; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index e020d076f..c72d79435 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -140,6 +140,9 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { TensorMap > result( @@ -295,6 +298,9 @@ struct TensorEvaluator > result(data, m_dimensions); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e209799bb..964222a15 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -18,6 +18,8 @@ namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) + #define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) + template class MemCopyFunctor { public: @@ -43,11 +45,12 @@ namespace Eigen { struct memsetkernelFunctor{ typedef cl::sycl::accessor AccType; AccType m_acc; + const ptrdiff_t buff_offset; const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} + memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){} void operator()(cl::sycl::nd_item<1> itemID) { auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid] = m_c; + if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c; } }; @@ -305,6 +308,11 @@ struct SyclDevice { synchronize(); } + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + auto it = m_queue_stream->find_buffer(ptr); + return (static_cast(ptr))-it->first; + + } /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode /// on it. 
Using a discard_write accessor guarantees that we do not bring back the current value of the @@ -343,20 +351,23 @@ struct SyclDevice { EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast(static_cast(data))),rng, GRange, tileSize, c )); + auto it1 = m_queue_stream->find_buffer(static_cast(data)); + ptrdiff_t buff_offset= (static_cast(data)) - it1->first; + sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); synchronize(); } struct memsetCghFunctor{ cl::sycl::buffer& m_buf; + const ptrdiff_t& buff_offset; const size_t& rng , GRange, tileSize; const int &c; - memsetCghFunctor(cl::sycl::buffer& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + memsetCghFunctor(cl::sycl::buffer& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} void operator()(cl::sycl::handler &cgh) const { auto buf_acc = m_buf.template get_access(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c3ca129e2..c9c7acfdc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -27,9 +27,9 @@ namespace internal { template struct syclGenericBufferReducer{ template -static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, @@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. 
*/ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer(op, aOut, aI, scratch, length, local)); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer(op, aOut, out_offset, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev template struct syclGenericBufferReducer, CoeffReturnType>{ template -static void run(Eigen::internal::MeanReducer, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(Eigen::internal::MeanReducer, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ syclGenericBufferReducer, CoeffReturnType>::run(Eigen::internal::SumReducer(), - bufOut, bufI, dev, length, local); + bufOut, out_offset, bufI, dev, length, local); } }; @@ -127,8 +127,9 @@ struct FullReducer { // getting final out buffer at the moment the created buffer is true because there is no need for assign auto out_buffer =dev.get_sycl_buffer(output); + ptrdiff_t out_offset = dev.get_offset(output); /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); + syclGenericBufferReducer::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -158,10 +159,11 @@ struct InnerReducer { // create a tuple of accessors from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto output_accessor = dev.template get_sycl_accessor(cgh, output); + ptrdiff_t out_offset = dev.get_offset(output); Index red_size = (num_values_to_reduce!=0)? 
num_values_to_reduce : static_cast(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::ReductionFunctor - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); + (output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); }); dev.asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index dd63a2e2f..9476c0ea8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -109,6 +109,7 @@ struct ConvertToDeviceExpression > {\ typedef CVQual ExprNode< typename ConvertToDeviceExpression::Type> Type;\ }; + // TensorForcedEvalOp KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp) KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 117b368ec..af4eb5f13 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -236,8 +236,12 @@ EVALTO() template \ struct ExprConstructor,\ CVQual PlaceHolder, N>, Params...> {\ - typedef CVQual TensorMap::Scalar,\ - TensorForcedEvalOp::NumDimensions, Eigen::internal::traits>::Layout, typename TensorForcedEvalOp::Index>, Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ + typedef TensorForcedEvalOp XprType;\ + typedef CVQual TensorMap<\ + Tensor::Layout,typename XprType::Index>,\ + Eigen::internal::traits::Layout, \ + MakeGlobalPointer\ + > Type;\ Type expr;\ template \ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ @@ -248,6 +252,28 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL + + +#define TENSORCUSTOMUNARYOP(CVQual)\ +template \ +struct ExprConstructor,\ +CVQual PlaceHolder, N>, Params...> {\ + typedef TensorCustomUnaryOp XprType;\ + typedef CVQual TensorMap<\ + Tensor::Layout,typename XprType::Index>,\ + Eigen::internal::traits::Layout, \ + MakeGlobalPointer\ + > Type;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ + : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ +}; + +TENSORCUSTOMUNARYOP(const) +TENSORCUSTOMUNARYOP() +#undef TENSORCUSTOMUNARYOP + template struct ValueCondition { static const size_t Res =X; }; @@ -260,7 +286,7 @@ template struct ValueCondition { template \ struct ExprConstructor,\ CVQual PlaceHolder, N>, Params...> {\ - static const size_t NumIndices= ValueCondition< TensorReductionOp::NumDimensions==0, 1, TensorReductionOp::NumDimensions >::Res;\ + static const auto NumIndices= ValueCondition< TensorReductionOp::NumDimensions==0, 1, TensorReductionOp::NumDimensions >::Res;\ typedef CVQual TensorMap::Scalar,\ NumIndices, Eigen::internal::traits>::Layout, typename TensorReductionOp::Index>, Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ Type expr;\ @@ -275,28 +301,31 @@ SYCLREDUCTIONEXPR() /// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorContractionOp -#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ +/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp +#define SYCLCONTRACTCONVCUSBIOPS(CVQual, 
ExprNode)\ template \ struct ExprConstructor,\ CVQual PlaceHolder, N>, Params...> {\ typedef ExprNode XprTyp;\ static const auto NumIndices= Eigen::internal::traits::NumDimensions;\ typedef CVQual TensorMap<\ Tensor::Layout, typename XprTyp::Index>,\ Eigen::internal::traits::Layout, \ MakeGlobalPointer\ > Type;\ Type expr;\ template \ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ }; -SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTION - +SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp) +SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp) +SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp) +SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp) +#undef SYCLCONTRACTCONVCUSBIOPS #define SYCLSLICEOPEXPR(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 4a6322d44..5a6a8f4c5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -148,6 +148,33 @@ SYCLFORCEDEVALEXTACC() #undef SYCLFORCEDEVALEXTACC +#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\ +template \ +struct ExtractAccessor, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ +}; + + +SYCLCUSTOMUNARYOPEXTACC(const) +SYCLCUSTOMUNARYOPEXTACC() +#undef SYCLCUSTOMUNARYOPEXTACC + + +#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\ +template \ +struct ExtractAccessor, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; + +SYCLCUSTOMBINARYOPEXTACC(const) +SYCLCUSTOMBINARYOPEXTACC() +#undef SYCLCUSTOMBINARYOPEXTACC + + + + /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp #define SYCLEVALTOEXTACC(CVQual)\ template \ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 8828a0495..9fcac5ecb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -33,14 +33,17 @@ namespace internal { /// re-instantiate them on the device. /// We have to pass instantiated functors to the device. // This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
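For readability, here is the DEFALTACTION macro introduced just below, expanded by hand for the generic leaf-node case; this sketch only restores the template parameter list that the hunk abbreviates and adds no behaviour:

    // Generic FunctorExtractor: only the dimensions cross the host/device
    // boundary, because the functors themselves are re-instantiated on the
    // device, as the comment above explains.
    template <typename Evaluator> struct FunctorExtractor {
      typedef typename Evaluator::Dimensions Dimensions;
      const Dimensions m_dimensions;
      EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
      FunctorExtractor(const Evaluator& expr) : m_dimensions(expr.dimensions()) {}
    };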
+#define DEFALTACTION(Evaluator)\ +typedef typename Evaluator::Dimensions Dimensions;\ +const Dimensions m_dimensions;\ +EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ +FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {} + template struct FunctorExtractor{ - typedef typename Evaluator::Dimensions Dimensions; - const Dimensions m_dimensions; - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - FunctorExtractor(const Evaluator& expr) - : m_dimensions(expr.dimensions()) {} + DEFALTACTION(Evaluator) }; + /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything ///TensorConversionOp #define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ @@ -112,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const) SYCLEXTRFUNCTERNARY() #undef SYCLEXTRFUNCTERNARY + + +//TensorCustomOp must be specialised otherwise it will be captured by UnaryCategory while its action is different +//from the UnaryCategory and it is similar to the general FunctorExtractor. +/// specialisation of TensorCustomOp +#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\ +template \ +struct FunctorExtractor, Dev> > {\ + typedef TensorEvaluator, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; + +SYCLEXTRFUNCCUSTOMUNARYOP(const) +SYCLEXTRFUNCCUSTOMUNARYOP() +#undef SYCLEXTRFUNCCUSTOMUNARYOP + + +#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\ +template \ +struct FunctorExtractor, Dev> > {\ + typedef TensorEvaluator, Dev> Evaluator;\ + DEFALTACTION(Evaluator)\ +}; + +SYCLEXTRFUNCCUSTOMBIBARYOP(const) +SYCLEXTRFUNCCUSTOMBIBARYOP() +#undef SYCLEXTRFUNCCUSTOMBIBARYOP + + + /// specialisation of the \ref FunctorExtractor struct when the node type is /// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
#define SYCLEXTRFUNCSELECTOP(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 2f7779036..12237bfab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -21,11 +21,12 @@ namespace internal { template struct GenericKernelReducer{ OP op; OutputAccessor aOut; + ptrdiff_t out_offset; InputAccessor aI; LocalAccessor scratch; size_t length, local; - GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) - : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){} void operator()(cl::sycl::nd_item<1> itemID) { size_t globalid = itemID.get_global(0); size_t localid = itemID.get_local(0); @@ -59,7 +60,7 @@ namespace internal { aI[itemID.get_group(0)] = scratch[localid]; if((length<=local) && globalid ==0){ auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); - aOutPtr[0]=scratch[0]; + aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0]; } } } @@ -72,8 +73,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen public: typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; typedef cl::sycl::accessor write_accessor; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression::Type DevExpr; @@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen typename DeviceSelf::CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast(globalid)),const_cast(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum; + output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; @@ -111,9 +113,9 @@ class ReductionFunctor::Type PlaceHolderExpr; typedef cl::sycl::accessor write_accessor; typedef Eigen::internal::SumReducer Op; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, + ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Eigen::internal::MeanReducer, Index range_, Index num_values_to_reduce_) - 
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} + :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} void operator()(cl::sycl::nd_item<1> itemID) { typedef typename ConvertToDeviceExpression::Type DevExpr; @@ -133,11 +135,12 @@ class ReductionFunctor::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast(globalid)),const_cast(functor), &accum); functor.finalize(accum); - output_accessor_ptr[globalid]= accum/num_values_to_reduce; + output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce; } } private: write_accessor output_accessor; + ptrdiff_t out_offset; FunctorExpr functors; Tuple_of_Acc tuple_of_accessors; Dims dims; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 50f4595fc..330283b39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -93,6 +93,26 @@ SYCLFORCEDEVALLEAFCOUNT(const) SYCLFORCEDEVALLEAFCOUNT() #undef SYCLFORCEDEVALLEAFCOUNT +#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\ +template \ +struct LeafCount > {\ +static const size_t Count =1;\ +}; + +SYCLCUSTOMUNARYOPLEAFCOUNT(const) +SYCLCUSTOMUNARYOPLEAFCOUNT() +#undef SYCLCUSTOMUNARYOPLEAFCOUNT + + +#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\ +template \ +struct LeafCount > {\ +static const size_t Count =1;\ }; +SYCLCUSTOMBINARYOPLEAFCOUNT(const) +SYCLCUSTOMBINARYOPLEAFCOUNT() +#undef SYCLCUSTOMBINARYOPLEAFCOUNT + /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp #define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\ template \ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index fcef0be04..99d528963 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -143,6 +143,33 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorCustomUnaryOp +#define CUSTOMUNARYOPEVAL(CVQual)\ +template \ +struct PlaceHolderExpression, N> {\ + typedef CVQual PlaceHolder, N> Type;\ +}; + +CUSTOMUNARYOPEVAL(const) +CUSTOMUNARYOPEVAL() +#undef CUSTOMUNARYOPEVAL + + +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorCustomBinaryOp +#define CUSTOMBINARYOPEVAL(CVQual)\ +template \ +struct PlaceHolderExpression, N> {\ + typedef CVQual PlaceHolder, N> Type;\ +}; + +CUSTOMBINARYOPEVAL(const) +CUSTOMBINARYOPEVAL() +#undef CUSTOMBINARYOPEVAL + + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorEvalToOp, TensorLayoutSwapOp #define EVALTOLAYOUTSWAP(CVQual, ExprNode)\ diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 508f29446..996178292 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -173,6 +173,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp new file mode 100644 index 000000000..9ff287fff --- /dev/null +++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp @@ -0,0 +1,165 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +template +struct InsertZeros { + DSizes dimensions(const TensorType& input) const { + DSizes result; + result[0] = input.dimension(0) * 2; + result[1] = input.dimension(1) * 2; + return result; + } + + template + void eval(const TensorType& input, Output& output, const Device& device) const + { + array strides; + strides[0] = 2; + strides[1] = 2; + output.stride(strides).device(device) = input; + + Eigen::DSizes offsets(1,1); + Eigen::DSizes extents(output.dimension(0)-1, output.dimension(1)-1); + output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f); + } +}; + +template +static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 5; + Eigen::array tensorRange = {{sizeDim1, sizeDim2}}; + Eigen::array tensorResultRange = {{6, 10}}; + + Eigen::Tensor in1(tensorRange); + Eigen::Tensor out(tensorResultRange); + + DataType * gpu_in1_data = static_cast(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(out.dimension(0), 6); + VERIFY_IS_EQUAL(out.dimension(1), 10); + + for (int i = 0; i < 6; i+=2) { + for (int j = 0; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2)); + } + } + for (int i = 1; i < 6; i+=2) { + for (int j = 1; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), 0); + } + } +} + +template +struct BatchMatMul { + DSizes dimensions(const TensorType& input1, const TensorType& input2) const { + DSizes result; + result[0] = input1.dimension(0); + result[1] = input2.dimension(1); + result[2] = input2.dimension(2); + return result; + } + + template + void eval(const TensorType& input1, const TensorType& input2, + Output& output, const Device& device) const + { + typedef typename 
TensorType::DimensionPair DimPair; + array dims; + dims[0] = DimPair(1, 0); + for (int64_t i = 0; i < output.dimension(2); ++i) { + output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims); + } + } +}; + +template +static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + + Eigen::array tensorRange1 = {{2, 3, 5}}; + Eigen::array tensorRange2 = {{3,7,5}}; + Eigen::array tensorResultRange = {{2, 7, 5}}; + + Eigen::Tensor in1(tensorRange1); + Eigen::Tensor in2(tensorRange2); + Eigen::Tensor out(tensorResultRange); + + DataType * gpu_in1_data = static_cast(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange1); + TensorType gpu_in2(gpu_in2_data, tensorRange2); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + in2.setRandom(); + + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); + + gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < 5; ++i) { + typedef typename Eigen::Tensor::DimensionPair DimPair; + array dims; + dims[0] = DimPair(1, 0); + Eigen::Tensor reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims); + TensorRef > val = out.template chip<2>(i); + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(val(j, k), reference(j, k)); + } + } + } +} + +template void custom_op_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_custom_unary_op_sycl(sycl_device); + test_custom_unary_op_sycl(sycl_device); + test_custom_binary_op_sycl(sycl_device); + test_custom_binary_op_sycl(sycl_device); + +} +void test_cxx11_tensor_custom_op_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(custom_op_perDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index aca036cde..a21514d56 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -44,7 +44,7 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { Eigen::TensorMap> gpu_in2(gpu_in2_data, tensorRange); Eigen::TensorMap> gpu_out(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); From f3e9c42876de4b49bdb16b72b09a342d930ce6f0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 
09:46:30 -0800 Subject: [PATCH 039/680] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/Diagonal.h | 10 ++++++---- Eigen/src/Core/Transpose.h | 10 +++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index 49e711257..c62f5ff21 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -184,7 +184,7 @@ template class Diagonal * * \sa class Diagonal */ template -inline typename MatrixBase::DiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalReturnType MatrixBase::diagonal() { return DiagonalReturnType(derived()); @@ -192,7 +192,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template -inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -210,7 +210,7 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) { return DiagonalDynamicIndexReturnType(derived(), index); @@ -218,7 +218,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). */ template -inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); @@ -237,6 +237,7 @@ MatrixBase::diagonal(Index index) const * \sa MatrixBase::diagonal(), class Diagonal */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template DiagonalIndexReturnType::Type MatrixBase::diagonal() { @@ -246,6 +247,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). 
*/ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type MatrixBase::diagonal() const { diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 79b767bcc..ba7d6e629 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -168,7 +168,7 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -inline Transpose +EIGEN_DEVICE_FUNC inline Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -180,7 +180,7 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -inline typename DenseBase::ConstTransposeReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); @@ -206,7 +206,7 @@ DenseBase::transpose() const * * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ template -inline const typename MatrixBase::AdjointReturnType +EIGEN_DEVICE_FUNC inline const typename MatrixBase::AdjointReturnType MatrixBase::adjoint() const { return AdjointReturnType(this->transpose()); @@ -281,7 +281,7 @@ struct inplace_transpose_selector { // non squ * * \sa transpose(), adjoint(), adjointInPlace() */ template -inline void DenseBase::transposeInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::transposeInPlace() { eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) && "transposeInPlace() called on a non-square non-resizable matrix"); @@ -312,7 +312,7 @@ inline void DenseBase::transposeInPlace() * * \sa transpose(), adjoint(), transposeInPlace() */ template -inline void MatrixBase::adjointInPlace() +EIGEN_DEVICE_FUNC inline void MatrixBase::adjointInPlace() { derived() = adjoint().eval(); } From 33443ec2b0d116daa5210fc5e7982ca4a1598bc7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 09:50:10 -0800 Subject: [PATCH 040/680] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/Redux.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index b6e8f8887..2b5b73bf7 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -407,7 +407,7 @@ protected: */ template template -typename internal::traits::Scalar +EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); @@ -422,7 +422,7 @@ DenseBase::redux(const Func& func) const * \warning the result is undefined if \c *this contains NaN. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::minCoeff() const { return derived().redux(Eigen::internal::scalar_min_op()); @@ -432,7 +432,7 @@ DenseBase::minCoeff() const * \warning the result is undefined if \c *this contains NaN. 
*/ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::maxCoeff() const { return derived().redux(Eigen::internal::scalar_max_op()); @@ -445,7 +445,7 @@ DenseBase::maxCoeff() const * \sa trace(), prod(), mean() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::sum() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -458,7 +458,7 @@ DenseBase::sum() const * \sa trace(), prod(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::mean() const { #ifdef __INTEL_COMPILER @@ -479,7 +479,7 @@ DenseBase::mean() const * \sa sum(), mean(), trace() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::prod() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -494,7 +494,7 @@ DenseBase::prod() const * \sa diagonal(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar MatrixBase::trace() const { return derived().diagonal().sum(); From e993c94f07faef161076c8710e8cb434174dd250 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 09:56:45 -0800 Subject: [PATCH 041/680] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/CommaInitializer.h | 4 ++-- Eigen/src/Core/CwiseBinaryOp.h | 5 ++--- Eigen/src/Core/CwiseNullaryOp.h | 4 ++-- Eigen/src/Core/Fuzzy.h | 6 +++--- Eigen/src/Core/NestByValue.h | 10 +++++----- Eigen/src/Core/ReturnByValue.h | 2 +- 6 files changed, 15 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index d218e9814..35fdbb819 100644 --- a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h @@ -141,7 +141,7 @@ struct CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer */ template -inline CommaInitializer DenseBase::operator<< (const Scalar& s) +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<< (const Scalar& s) { return CommaInitializer(*static_cast(this), s); } @@ -149,7 +149,7 @@ inline CommaInitializer DenseBase::operator<< (const Scalar& s /** \sa operator<<(const Scalar&) */ template template -inline CommaInitializer +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<<(const DenseBase& other) { return CommaInitializer(*static_cast(this), other); diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index a36765e39..bf2632d9e 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -158,7 +158,7 @@ public: */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator-=(const MatrixBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -171,7 +171,7 @@ MatrixBase::operator-=(const MatrixBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator+=(const MatrixBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -181,4 +181,3 @@ MatrixBase::operator+=(const MatrixBase& other) } // end namespace Eigen #endif // EIGEN_CWISE_BINARY_OP_H - 
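With the two hunks above, the matrix compound assignments join the device-callable set. A minimal sketch of the usage this enables (assuming a CUDA build; kernel and names are illustrative, not part of the patch):

    __global__ void accumulate(Eigen::Vector4f* acc, const Eigen::Vector4f* x)
    {
      *acc += *x;  // MatrixBase::operator+=, EIGEN_DEVICE_FUNC as of this patch
      *acc -= *x;  // MatrixBase::operator-=, likewise
    }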
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index ddd607e38..144608ec2 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -131,7 +131,7 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -170,7 +170,7 @@ DenseBase::NullaryExpr(const CustomNullaryOp& func) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index rows, Index cols, const Scalar& value) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); diff --git a/Eigen/src/Core/Fuzzy.h b/Eigen/src/Core/Fuzzy.h index 3e403a09d..43aa49b2b 100644 --- a/Eigen/src/Core/Fuzzy.h +++ b/Eigen/src/Core/Fuzzy.h @@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector */ template template -bool DenseBase::isApprox( +EIGEN_DEVICE_FUNC bool DenseBase::isApprox( const DenseBase& other, const RealScalar& prec ) const @@ -122,7 +122,7 @@ bool DenseBase::isApprox( * \sa isApprox(), isMuchSmallerThan(const DenseBase&, RealScalar) const */ template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const typename NumTraits::Real& other, const RealScalar& prec ) const @@ -142,7 +142,7 @@ bool DenseBase::isMuchSmallerThan( */ template template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const DenseBase& other, const RealScalar& prec ) const diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 13adf070e..01cf192e9 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -67,25 +67,25 @@ template class NestByValue } template - inline const PacketScalar packet(Index row, Index col) const + EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const { return m_expression.template packet(row, col); } template - inline void writePacket(Index row, Index col, const PacketScalar& x) + EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x) { m_expression.const_cast_derived().template writePacket(row, col, x); } template - inline const PacketScalar packet(Index index) const + EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const { return m_expression.template packet(index); } template - inline void writePacket(Index index, const PacketScalar& x) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x) { m_expression.const_cast_derived().template writePacket(index, x); } @@ -99,7 +99,7 @@ template class NestByValue /** \returns an expression of the temporary version of *this. 
*/ template -inline const NestByValue +EIGEN_DEVICE_FUNC inline const NestByValue DenseBase::nestByValue() const { return NestByValue(derived()); diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index c44b7673b..11dc86d07 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h @@ -79,7 +79,7 @@ template class ReturnByValue template template -Derived& DenseBase::operator=(const ReturnByValue& other) +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const ReturnByValue& other) { other.evalTo(derived()); return derived(); From 765f4cc4b4774fc114ab794e998c5ab4f2d733f9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 11:57:00 -0800 Subject: [PATCH 042/680] Deleted extra: EIGEN_DEVICE_FUNC: the QR and Cholesky code isn't ready to run on GPU yet. --- Eigen/src/Cholesky/LDLT.h | 1 - Eigen/src/Cholesky/LLT.h | 1 - Eigen/src/QR/ColPivHouseholderQR.h | 1 - Eigen/src/QR/CompleteOrthogonalDecomposition.h | 2 +- Eigen/src/QR/FullPivHouseholderQR.h | 9 ++++----- Eigen/src/QR/HouseholderQR.h | 9 ++++----- 6 files changed, 9 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index fcee7b2e3..9b4fdb414 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -258,7 +258,6 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 87ca8d423..e6c02d803 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -200,7 +200,6 @@ template class LLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 0e47c8332..d35395d04 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -416,7 +416,6 @@ template class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 34c637b70..13b61fcdb 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -367,7 +367,7 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; + void _solve_impl(const RhsType& rhs, DstType& dst) const; #endif protected: diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index e489bddc2..c31e47cc4 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -392,22 +392,21 @@ template class FullPivHouseholderQR * diagonal coefficient of U. 
*/ RealScalar maxPivot() const { return m_maxpivot; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 3513d995c..762b21c36 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -204,28 +204,27 @@ template class HouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. * * For advanced uses only. */ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; From de7b0fdea9db957d2135c32e850ad069b64b5f1e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 13:52:22 -0800 Subject: [PATCH 043/680] Made the TensorStorage class compile with clang 3.9 --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 2854a4a17..e6a666f78 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -31,12 +31,12 @@ namespace Eigen { * * \sa Tensor */ -template class TensorStorage; +template class TensorStorage; // Pure fixed-size storage -template -class TensorStorage +template +class TensorStorage { private: static const std::size_t Size = FixedDimensions::total_size; @@ -66,7 +66,7 @@ class TensorStorage // pure dynamic -template +template class TensorStorage, Options_> { public: From 4a7df114c883eb17251f25b8b975c0ddf266acd6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 14:00:15 -0800 Subject: [PATCH 044/680] Added missing EIGEN_DEVICE_FUNC --- Eigen/src/Core/Assign.h | 2 +- Eigen/src/Core/Dot.h | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 53806ba33..655412efd 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -16,7 +16,7 @@ namespace Eigen { template template -EIGEN_STRONG_INLINE Derived& DenseBase +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase ::lazyAssign(const DenseBase& other) { enum{ diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 06ef18b8b..bb8e3fecc 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -90,7 +90,7 @@ MatrixBase::dot(const MatrixBase& other) const * \sa dot(), norm(), lpNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const { return numext::real((*this).cwiseAbs2().sum()); } @@ -102,7 +102,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -inline typename NumTraits::Scalar>::Real 
MatrixBase::norm() const +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -117,7 +117,7 @@ inline typename NumTraits::Scalar>::Real Matr * \sa norm(), normalize() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC inline const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -139,7 +139,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -inline void MatrixBase::normalize() +EIGEN_DEVICE_FUNC inline void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -160,7 +160,7 @@ inline void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC inline const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -185,7 +185,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -inline void MatrixBase::stableNormalize() +EIGEN_DEVICE_FUNC inline void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); @@ -257,9 +257,9 @@ struct lpNorm_selector template template #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NumTraits::Scalar>::Real +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real #else -MatrixBase::RealScalar +EIGEN_DEVICE_FUNC MatrixBase::RealScalar #endif MatrixBase::lpNorm() const { From c36bc2d445596d46c7f5a9271bfa69c79e2e1558 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 14:58:45 -0800 Subject: [PATCH 045/680] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/BooleanRedux.h | 6 +++--- Eigen/src/Core/DiagonalProduct.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index ed607d5d8..ccf519067 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -76,7 +76,7 @@ struct any_unroller * \sa any(), Cwise::operator<() */ template -inline bool DenseBase::all() const +EIGEN_DEVICE_FUNC inline bool DenseBase::all() const { typedef internal::evaluator Evaluator; enum { @@ -100,7 +100,7 @@ inline bool DenseBase::all() const * \sa all() */ template -inline bool DenseBase::any() const +EIGEN_DEVICE_FUNC inline bool DenseBase::any() const { typedef internal::evaluator Evaluator; enum { @@ -124,7 +124,7 @@ inline bool DenseBase::any() const * \sa all(), any() */ template -inline Eigen::Index DenseBase::count() const +EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const { return derived().template cast().template cast().sum(); } diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h index d372b938f..7911d1cd1 100644 --- a/Eigen/src/Core/DiagonalProduct.h +++ b/Eigen/src/Core/DiagonalProduct.h @@ -17,7 +17,7 @@ namespace Eigen { */ template template -inline const Product +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const DiagonalBase &a_diagonal) const { return Product(derived(),a_diagonal.derived()); From 857adbbd52bb1a36c913a828fb5f24b95deee965 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 16:42:00 -0800 Subject: [PATCH 046/680] Added missing EIGEN_DEVICE_FUNC qualifiers --- 
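Note: the qualifier has to appear on these out-of-line definitions and not only on the in-class declarations, because the CUDA front ends bind a function's execution space at its declaration and require the definition to agree with it. Below is a minimal sketch of the rule in plain CUDA C++, not Eigen code; DEVICE_FUNC is a hypothetical stand-in for EIGEN_DEVICE_FUNC:

    #if defined(__CUDACC__)
    #define DEVICE_FUNC __host__ __device__
    #else
    #define DEVICE_FUNC
    #endif

    struct Wrapper {
      int v;
      DEVICE_FUNC int twice() const;  // declared callable from host and device
    };

    // The definition must repeat the qualifier; without it the compiler
    // treats twice() as host-only, and a device-side call no longer compiles
    // even though the declaration above is __host__ __device__.
    DEVICE_FUNC int Wrapper::twice() const { return 2 * v; }
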
Eigen/src/Core/Random.h | 2 +- Eigen/src/Core/Replicate.h | 4 ++-- Eigen/src/Core/Reverse.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h index 6faf789c7..486e9ed52 100644 --- a/Eigen/src/Core/Random.h +++ b/Eigen/src/Core/Random.h @@ -128,7 +128,7 @@ DenseBase::Random() * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) */ template -inline Derived& DenseBase::setRandom() +EIGEN_DEVICE_FUNC inline Derived& DenseBase::setRandom() { return *this = Random(rows(), cols()); } diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h index 9960ef884..0b2d6d743 100644 --- a/Eigen/src/Core/Replicate.h +++ b/Eigen/src/Core/Replicate.h @@ -115,7 +115,7 @@ template class Replicate */ template template -const Replicate +EIGEN_DEVICE_FUNC const Replicate DenseBase::replicate() const { return Replicate(derived()); @@ -130,7 +130,7 @@ DenseBase::replicate() const * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate */ template -const typename VectorwiseOp::ReplicateReturnType +EIGEN_DEVICE_FUNC const typename VectorwiseOp::ReplicateReturnType VectorwiseOp::replicate(Index factor) const { return typename VectorwiseOp::ReplicateReturnType diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 0640cda2a..8b6b3ab03 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -114,7 +114,7 @@ template class Reverse * */ template -inline typename DenseBase::ReverseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ReverseReturnType DenseBase::reverse() { return ReverseReturnType(derived()); @@ -136,7 +136,7 @@ DenseBase::reverse() * * \sa VectorwiseOp::reverseInPlace(), reverse() */ template -inline void DenseBase::reverseInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::reverseInPlace() { if(cols()>rows()) { @@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl * * \sa DenseBase::reverseInPlace(), reverse() */ template -void VectorwiseOp::reverseInPlace() +EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() { internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); } From c92406d6137e4c55fc608c6916cc20899d00cff4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 17:03:11 -0800 Subject: [PATCH 047/680] Silenced clang compilation warning. 
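When CUDA is not enabled, the branch below reduces to a bare assert, so clang presumably flags dst, src and n as unused parameters; routing each argument through EIGEN_UNUSED_VARIABLE marks it as used. A standalone sketch of the same pattern, with UNUSED_VARIABLE as a hypothetical stand-in for the Eigen macro:

    #include <cassert>
    #include <cstddef>

    #define UNUSED_VARIABLE(x) (void)(x)  // discarded-value use of x, no code generated

    void device_memcpy(void* dst, const void* src, std::size_t n) {
      // Casting each parameter to void counts as a use, which silences the
      // unused-parameter warning in this assert-only configuration.
      UNUSED_VARIABLE(dst);
      UNUSED_VARIABLE(src);
      UNUSED_VARIABLE(n);
      assert(false && "The default device should be used instead to generate kernel code");
    }
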
--- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index e6cee11ef..be8d69386 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -217,7 +217,10 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - eigen_assert(false && "The default device should be used instead to generate kernel code"); + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } From 7b61944669f23a20f6c850b9c07d930c049c6ede Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 17:05:28 -0800 Subject: [PATCH 048/680] Made most of the packet math primitives usable within CUDA kernel when compiling with clang --- Eigen/src/Core/arch/CUDA/PacketMath.h | 4 ++-- Eigen/src/Core/util/Macros.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 4dda63188..8c46af09b 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -167,10 +167,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const d return make_double2(from[0], from[1]); } -template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { return make_float4(from[0], from[0], from[1], from[1]); } -template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { return make_double2(from[0], from[0]); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 29c796647..14ec87da8 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -542,8 +542,8 @@ // - static is not very good because it prevents definitions from different object files to be merged. // So static causes the resulting linked executable to be bloated with multiple copies of the same function. // - inline is not perfect either as it unwantedly hints the compiler toward inlining the function. 
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline #ifdef NDEBUG # ifndef EIGEN_NO_DEBUG From 3a3f040baa602e13fe36d2949712d4af1b2db354 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Feb 2017 17:06:15 -0800 Subject: [PATCH 049/680] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/VectorwiseOp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 4fe267e9f..893bc796f 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -670,7 +670,7 @@ template class VectorwiseOp * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::ColwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ColwiseReturnType DenseBase::colwise() { return ColwiseReturnType(derived()); @@ -684,7 +684,7 @@ DenseBase::colwise() * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::RowwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::RowwiseReturnType DenseBase::rowwise() { return RowwiseReturnType(derived()); From c1d87ec110d9ee96da39d58c8f88481f5cb9d04c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Mar 2017 10:08:50 -0800 Subject: [PATCH 050/680] Added missing EIGEN_DEVICE_FUNC qualifiers --- Eigen/src/Core/CoreEvaluators.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 412f5a661..54276b836 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -139,14 +139,14 @@ public: EIGEN_ONLY_USED_FOR_DEBUG(outerStride); eigen_internal_assert(outerStride==OuterStride); } - Index outerStride() const { return OuterStride; } + EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; } const Scalar *data; }; template class plainobjectbase_evaluator_data { public: plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} - Index outerStride() const { return m_outerStride; } + EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; } const Scalar *data; protected: Index m_outerStride; From 1e2d046651102c57f5f4eca38ff7844e1b0ca6fd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Mar 2017 10:13:42 -0800 Subject: [PATCH 051/680] Silenced a couple of compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 1 + unsupported/Eigen/CXX11/src/util/EmulateArray.h | 1 + 2 files changed, 2 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 3523e7c94..d23f2e4c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -23,6 +23,7 @@ struct static_val { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { + EIGEN_UNUSED_VARIABLE(v); eigen_assert(v == n); } }; diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 03169d591..573ca435a 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -169,6 +169,7 
@@ template class array { #if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy() { + EIGEN_UNUSED_VARIABLE(l); eigen_assert(l.size() == 0); } #endif From 09ae0e6586b978ce1ea9960984e1228dfc8971b8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Mar 2017 11:47:47 -0800 Subject: [PATCH 052/680] Adjusted the EIGEN_DEVICE_FUNC qualifiers to make sure that: * they're used consistently between the declaration and the definition of a function * we avoid calling host only methods from host device methods. --- Eigen/src/Core/CoreEvaluators.h | 4 ++-- Eigen/src/Core/DiagonalMatrix.h | 4 ++-- Eigen/src/Core/GeneralProduct.h | 2 +- Eigen/src/Core/SelfAdjointView.h | 4 ++-- Eigen/src/Core/SolveTriangular.h | 2 +- Eigen/src/Core/TriangularMatrix.h | 15 ++++++++------- .../Core/products/GeneralMatrixMatrixTriangular.h | 6 +++--- Eigen/src/Core/products/SelfadjointProduct.h | 2 +- Eigen/src/Core/products/SelfadjointRank2Update.h | 2 +- Eigen/src/Core/util/IntegralConstant.h | 4 ++-- Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h | 4 ++-- Eigen/src/LU/FullPivLU.h | 2 -- Eigen/src/SVD/SVDBase.h | 1 - 13 files changed, 25 insertions(+), 27 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 54276b836..15b361b38 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -134,7 +134,7 @@ private: // this helper permits to completely eliminate m_outerStride if it is known at compiletime. template class plainobjectbase_evaluator_data { public: - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) + EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) { EIGEN_ONLY_USED_FOR_DEBUG(outerStride); eigen_internal_assert(outerStride==OuterStride); @@ -145,7 +145,7 @@ public: template class plainobjectbase_evaluator_data { public: - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} + EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; } const Scalar *data; protected: diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index ecfdce8ef..4e8297ee6 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } - + EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } EIGEN_DEVICE_FUNC @@ -273,7 +273,7 @@ class DiagonalWrapper * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() **/ template -inline const DiagonalWrapper +EIGEN_DEVICE_FUNC inline const DiagonalWrapper MatrixBase::asDiagonal() const { return DiagonalWrapper(derived()); diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 0f16cd8e3..b206b0a7a 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -428,7 +428,7 @@ MatrixBase::operator*(const MatrixBase &other) const template template const Product -MatrixBase::lazyProduct(const MatrixBase &other) const +EIGEN_DEVICE_FUNC MatrixBase::lazyProduct(const MatrixBase &other) const { enum { ProductIsValid = Derived::ColsAtCompileTime==Dynamic diff --git a/Eigen/src/Core/SelfAdjointView.h 
b/Eigen/src/Core/SelfAdjointView.h index 504c98f0e..7e71fe3c0 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -322,7 +322,7 @@ public: /** This is the const version of MatrixBase::selfadjointView() */ template template -typename MatrixBase::template ConstSelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstSelfAdjointViewReturnType::Type MatrixBase::selfadjointView() const { return typename ConstSelfAdjointViewReturnType::Type(derived()); @@ -339,7 +339,7 @@ MatrixBase::selfadjointView() const */ template template -typename MatrixBase::template SelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template SelfAdjointViewReturnType::Type MatrixBase::selfadjointView() { return typename SelfAdjointViewReturnType::Type(derived()); diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 049890b25..a0011d4f9 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -164,7 +164,7 @@ struct triangular_solver_selector { #ifndef EIGEN_PARSED_BY_DOXYGEN template template -void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const +EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 667ef09dc..ed80da36a 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -488,7 +488,6 @@ template class TriangularViewImpl<_Mat * \sa TriangularView::solveInPlace() */ template - EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval solve(const MatrixBase& other) const; @@ -554,7 +553,7 @@ template class TriangularViewImpl<_Mat // FIXME should we keep that possibility template template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op()); @@ -564,7 +563,7 @@ TriangularViewImpl::operator=(const MatrixBase template -void TriangularViewImpl::lazyAssign(const MatrixBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.template triangularView()); } @@ -573,7 +572,7 @@ void TriangularViewImpl::lazyAssign(const MatrixBase template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); @@ -583,7 +582,7 @@ TriangularViewImpl::operator=(const TriangularBase template -void TriangularViewImpl::lazyAssign(const TriangularBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); internal::call_assignment_no_alias(derived(), other.derived()); @@ -598,7 +597,7 @@ void TriangularViewImpl::lazyAssign(const TriangularBas * If the matrix is triangular, the opposite part is set to zero. 
*/ template template -void TriangularBase::evalTo(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase &other) const { evalToLazy(other.derived()); } @@ -624,6 +623,7 @@ void TriangularBase::evalTo(MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template TriangularViewReturnType::Type MatrixBase::triangularView() { @@ -633,6 +633,7 @@ MatrixBase::triangularView() /** This is the const version of MatrixBase::triangularView() */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstTriangularViewReturnType::Type MatrixBase::triangularView() const { @@ -930,7 +931,7 @@ struct triangular_assignment_loop * If the matrix is triangular, the opposite part is set to zero. */ template template -void TriangularBase::evalToLazy(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 7122efa60..ad38bcf51 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -292,12 +292,12 @@ struct general_product_to_triangular_selector template template -TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - + general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - + return derived(); } diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index f038d686f..39c5b59ff 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -120,7 +120,7 @@ struct selfadjoint_product_selector template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const Scalar& alpha) { selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index 2ae364111..d395888e5 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -57,7 +57,7 @@ template struct conj_expr_if template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const Scalar& alpha) { typedef internal::blas_traits UBlasTraits; diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index ae41015bd..78a4705cd 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -151,9 +151,9 @@ struct get_fixed_value,Default> { static const int value = N; }; -template Index get_runtime_value(const T &x) { return x; } +template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } #if !EIGEN_HAS_CXX14 -template Index get_runtime_value(FixedInt (*)()) { return N; } +template 
EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } #endif // Cleanup integer/FixedInt/VariableAndFixedInt/etc types: diff --git a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 4fec8af0a..dbbd4806a 100644 --- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -85,7 +85,7 @@ MatrixBase::eigenvalues() const * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues() */ template -inline typename SelfAdjointView::EigenvaluesReturnType +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { typedef typename SelfAdjointView::PlainObject PlainObject; @@ -149,7 +149,7 @@ MatrixBase::operatorNorm() const * \sa eigenvalues(), MatrixBase::operatorNorm() */ template -inline typename SelfAdjointView::RealScalar +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::RealScalar SelfAdjointView::operatorNorm() const { return eigenvalues().cwiseAbs().maxCoeff(); diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 03b6af706..ec61086d5 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -411,11 +411,9 @@ template class FullPivLU #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template - EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index cc90a3b75..429414797 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -212,7 +212,6 @@ public: #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif From a71943b9a432c8962f025b56313584f33111ace4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Mar 2017 10:47:29 -0800 Subject: [PATCH 053/680] Made the Tensor code compile with clang 3.9 --- .../CXX11/src/Tensor/TensorContractionCuda.h | 101 +++++++++--------- .../CXX11/src/Tensor/TensorReductionCuda.h | 1 - 2 files changed, 48 insertions(+), 54 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index d65dbb40f..c04b784a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -529,7 +529,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { - typedef float Scalar; // prefetch registers float4 lhs_pf0, rhs_pf0; @@ -540,27 +539,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.template loadPacket(row, col); \ + } \ + } else { \ 
+ if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ Index lhs_vert = base_m+threadIdx.x*4; @@ -578,7 +577,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -593,7 +592,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } else { if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); @@ -766,7 +765,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { - typedef float Scalar; // prefetch registers float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; @@ -790,37 +788,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_LHS_BOUNDARY) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else { // just CHECK_LHS_BOUNDARY if (lhs_vert + 3 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 
=lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else if (lhs_vert + 2 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { @@ -909,8 +907,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -932,8 +930,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (rhs_horiz1 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -954,7 +952,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } else if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -1137,9 +1135,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, typedef float2 LHS_MEM[64][32]; typedef float2 RHS_MEM[128][8]; - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - const Index m_block_idx = blockIdx.x; const Index n_block_idx = blockIdx.y; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 65638b6a8..edb0ab280 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -287,7 +287,6 @@ struct FullReductionLauncher< void>::type> { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; - typedef typename Self::CoeffReturnType Scalar; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup(num_coeffs, block_size * num_per_thread); From bbe717fa2f9fe4828c845b27f4a7d4bb77fffdb2 Mon Sep 17 00:00:00 
2001 From: Julian Kent Date: Fri, 3 Mar 2017 12:58:51 +0100 Subject: [PATCH 054/680] Make scaling work with non-square matrices --- unsupported/Eigen/src/IterativeSolvers/Scaling.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/unsupported/Eigen/src/IterativeSolvers/Scaling.h index d113e6e90..9b3eb53e0 100644 --- a/unsupported/Eigen/src/IterativeSolvers/Scaling.h +++ b/unsupported/Eigen/src/IterativeSolvers/Scaling.h @@ -104,12 +104,18 @@ class IterScaling for (int i = 0; i < m; ++i) { Dr(i) = std::sqrt(Dr(i)); + } + for (int i = 0; i < n; ++i) + { Dc(i) = std::sqrt(Dc(i)); } // Save the scaling factors for (int i = 0; i < m; ++i) { m_left(i) /= Dr(i); + } + for (int i = 0; i < n; ++i) + { m_right(i) /= Dc(i); } // Scale the rows and the columns of the matrix From 1c03d43a5cc421da067f12fe02db9eedaa1d125d Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Mon, 6 Mar 2017 12:01:12 +0100 Subject: [PATCH 055/680] Fixed compilation with cuda-clang --- Eigen/src/Core/MathFunctions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 7a6b999af..5ec6c395e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -512,7 +512,7 @@ namespace std_fallback { template struct expm1_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH @@ -549,7 +549,7 @@ namespace std_fallback { template struct log1p_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH From 659087b622e94f7e35a56b7ed2cb01b024c80a7b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 10:02:34 +0100 Subject: [PATCH 056/680] bug #1400: fix stableNorm with EIGEN_DONT_ALIGN_STATICALLY --- Eigen/src/Core/StableNorm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index d2fe1e199..be04ed44d 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -170,7 +170,8 @@ MatrixBase::stableNorm() const enum { CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization }; typedef typename internal::conditional, internal::evaluator::Alignment>, typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; From d9677185250d780b0011202621c90530937049d2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 10:16:39 +0100 Subject: [PATCH 057/680] do not include std header within extern C --- Eigen/src/misc/lapacke.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h index 8c7e79b03..dd6fd3b0a 100755 --- a/Eigen/src/misc/lapacke.h +++ b/Eigen/src/misc/lapacke.h @@ -43,10 +43,6 @@ #include "lapacke_config.h" #endif -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - #include #ifndef lapack_int @@ -81,7 +77,7 @@ extern "C" { #endif #ifndef lapack_complex_float_real -#define lapack_complex_float_real(z) (creal(z)) +#define lapack_complex_float_rea@l(z) (creal(z)) #endif #ifndef lapack_complex_float_imag @@ -108,6 
+104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im ); #endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + #ifndef LAPACKE_malloc #define LAPACKE_malloc( size ) malloc( size ) #endif From e958c2baac15ccf0cf7b7919ef729b118e43a6ed Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 10:47:40 +0100 Subject: [PATCH 058/680] remove UTF8 symbols --- Eigen/src/Core/arch/CUDA/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index db9878796..67518da9f 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -13,7 +13,7 @@ // Redistribution and use in source and binary forms, with or without // modification, are permitted. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, From 5694315fbb7a2abc157cad284852b5e3df2f9576 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 10:53:47 +0100 Subject: [PATCH 059/680] remove UTF8 symbol --- Eigen/src/Core/DenseBase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 91a8511be..fd933eed4 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -296,7 +296,7 @@ template class DenseBase EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue& func); - /** \ínternal + /** \internal * Copies \a other into *this without evaluating other. \returns a reference to *this. * \deprecated */ template From e5156e4d253fcc8ac13b0131973de3f56e810ab5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Mar 2017 11:25:58 +0100 Subject: [PATCH 060/680] fix typo --- Eigen/src/misc/lapacke.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h index dd6fd3b0a..3d8e24f5a 100755 --- a/Eigen/src/misc/lapacke.h +++ b/Eigen/src/misc/lapacke.h @@ -77,7 +77,7 @@ #endif #ifndef lapack_complex_float_real -#define lapack_complex_float_rea@l(z) (creal(z)) +#define lapack_complex_float_real(z) (creal(z)) #endif #ifndef lapack_complex_float_imag From f84963ed95ff277bf3abb2e2517b3017a25ccf3f Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 7 Mar 2017 14:27:10 +0000 Subject: [PATCH 061/680] Adding TensorIndexTuple and TensorTupleReduceOP backend (ArgMax/Min) for sycl; fixing the address space issue for const TensorMap; converting all discard_write to write due to data mismatch.
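A recurring detail in the new parallel_for_setup overloads below is the launch-size rounding: the global range is padded up to the next multiple of the tile (work-group) size so that every SYCL work-group launches full. A minimal sketch of that rounding with hypothetical names, using std::size_t in place of the Index template parameter:

    #include <cstddef>

    void round_up_launch(std::size_t n, std::size_t& tileSize, std::size_t& GRange) {
      GRange = (n == 0) ? 1 : n;        // never launch an empty range
      if (tileSize > GRange) {
        tileSize = GRange;              // shrink the tile for tiny inputs
      } else if (std::size_t rem = GRange % tileSize) {
        GRange += tileSize - rem;       // pad to the next multiple of tileSize
      }
      // GRange may now exceed n, so the kernels guard their work-item body
      // with a check such as `if (globalId < n)`.
    }
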
--- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 19 +- .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 146 ++++++++++ .../CXX11/src/Tensor/TensorContractionSycl.h | 6 +- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 268 +++++++++++------- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 5 + .../Eigen/CXX11/src/Tensor/TensorMeta.h | 2 + .../CXX11/src/Tensor/TensorReductionSycl.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 12 + .../TensorSyclConvertToDeviceExpression.h | 36 ++- .../src/Tensor/TensorSyclExprConstructor.h | 77 +++-- .../src/Tensor/TensorSyclExtractAccessor.h | 48 ++-- .../src/Tensor/TensorSyclExtractFunctors.h | 66 +++-- .../CXX11/src/Tensor/TensorSyclFunctors.h | 16 +- .../CXX11/src/Tensor/TensorSyclLeafCount.h | 34 ++- .../src/Tensor/TensorSyclPlaceHolderExpr.h | 38 ++- .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 3 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_argmax_sycl.cpp | 248 ++++++++++++++++ 19 files changed, 813 insertions(+), 226 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h create mode 100644 unsupported/test/cxx11_tensor_argmax_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index d06f40cd8..e81001c6e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -119,6 +119,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + // required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { + return m_impl; + } + + protected: TensorEvaluator m_impl; }; @@ -222,7 +228,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) { + m_return_dim(op.return_dim()), m_device(device) { gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast(ColMajor)) { @@ -252,7 +258,16 @@ struct TensorEvaluator, Devi return (m_return_dim < 0) ? 
v.first : (v.first % m_stride_mod) / m_stride_div; } + #ifndef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + #else // following functions are required by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;} + const Device& device() const{return m_device;} + #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -292,6 +307,8 @@ struct TensorEvaluator, Devi StrideDims m_strides; Index m_stride_mod; Index m_stride_div; + // required by sycl + const Device& m_device; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h new file mode 100644 index 000000000..90cbe004f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -0,0 +1,146 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorArgMaxSycl.h + * \brief: + * TensorArgMaxSycl + * +*****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP +namespace Eigen { +namespace internal { + template + struct eval, Eigen::Dense> + { + typedef const TensorTupleReducerDeviceOp& type; + }; + + template + struct nested, 1, + typename eval >::type> + { + typedef TensorTupleReducerDeviceOp type; + }; + +template +struct traits > : public traits +{ + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + + +}// end namespace internal +template +class TensorTupleReducerDeviceOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr, + const int return_dim, + const StrideDims& strides, + const Index& stride_mod, const Index& stride_div) + :m_xpr(expr), m_return_dim(return_dim), m_strides(strides), m_stride_mod(stride_mod), m_stride_div(stride_div) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + 
expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + int return_dim() const { return m_return_dim; } + + EIGEN_DEVICE_FUNC + const StrideDims& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const Index& stride_mod() const { return m_stride_mod; } + + EIGEN_DEVICE_FUNC + const Index& stride_div() const { return m_stride_div; } + + protected: + typename Eigen::internal::remove_all::type m_xpr; + const int m_return_dim; + const StrideDims& m_strides; + const Index m_stride_mod; + const Index m_stride_div; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, SyclKernelDevice> +{ + typedef TensorTupleReducerDeviceOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Index Scalar; + typedef Index CoeffReturnType; + typedef typename XprType::CoeffReturnType TupleType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device) + : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()), + m_stride_div(op.stride_div()){} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } +typedef typename MakeGlobalPointer::CoeffReturnType >::Type ptr_Dev_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_Dev_type data() const { return const_cast(m_impl.data()); } + +protected: + TensorEvaluator m_impl; + const int m_return_dim; + const StrideDims& m_strides; + const Index& m_stride_mod; + const Index& m_stride_div; +}; +} // end namespace Eigen +#endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index fcd7d4d00..5b4c3c5bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -11,7 +11,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
/***************************************************************** - * TensorSyclConvertToDeviceExpression.h + * TensorTensorContractionsycl.h * * \brief: * TensorContractionsycl @@ -389,9 +389,9 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), KernelConstructor(lhs_functors, rhs_functors, + WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::SyclKernelDevice>(lhs_functors, rhs_functors, localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); + m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::SyclKernelDevice())); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 66ffd819f..5db16d559 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -45,7 +45,7 @@ EigenConvolutionKernel1D(internal::IndexMapper itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -103,7 +103,7 @@ EigenConvolutionKernel2D(internal::IndexMapper itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -173,7 +173,7 @@ EigenConvolutionKernel3D(internal::IndexMapper itemID) { typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); @@ -339,8 +339,8 @@ struct TensorEvaluator(cgh, m_inputImpl); - typedef cl::sycl::accessor OutputAccessorType; - OutputAccessorType out_res= m_device. template get_sycl_accessor(cgh, data); + typedef cl::sycl::accessor OutputAccessorType; + OutputAccessorType out_res= m_device. template get_sycl_accessor(cgh, data); typedef cl::sycl::accessor KernelAccessorType; KernelAccessorType kernel_acc= m_device. 
template get_sycl_accessor(cgh, m_kernel); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 964222a15..258218463 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -41,9 +41,8 @@ namespace Eigen { size_t m_i; size_t m_offset; }; - +template struct memsetkernelFunctor{ - typedef cl::sycl::accessor AccType; AccType m_acc; const ptrdiff_t buff_offset; const size_t m_rng, m_c; @@ -55,15 +54,19 @@ namespace Eigen { }; + //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); std::vector::iterator it =devices.begin(); while(it!=devices.end()) { - /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) + ///FIXME: Currently there is a bug in amd cpu OpenCL auto s= (*it).template get_info(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs it=devices.erase(it); + //FIXME: currently there is a bug in intel gpu driver regarding memory allignment issue. + }else if((*it).is_gpu() && s.find("intel")!=std::string::npos){ + it=devices.erase(it); } else{ ++it; @@ -112,6 +115,154 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { })) #endif {} +//FIXME: currently we have to switch back to write as discard_write doesnot work in forloop +template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { + std::lock_guard lock(mutex_); + auto host_acc= find_buffer(dst)->second. template get_access(); + ::memcpy(host_acc.get_pointer(), src, n); +} + +template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { + std::lock_guard lock(mutex_); + // Assuming that the dst is the start of the destination pointer +auto it =find_buffer(src); +auto offset =static_cast(static_cast(src))- it->first; +offset/=sizeof(Index); +size_t rng, GRange, tileSize; +parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc= it->second.template get_access(cgh); + auto dst_acc =dest_buf.template get_access(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); + }); + synchronize(); + +} + +EIGEN_STRONG_INLINE void synchronize() const { + std::lock_guard lock(mutex_); + m_queue.wait_and_throw(); //pass +} +EIGEN_STRONG_INLINE void asynchronousExec() const { + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled + std::lock_guard lock(mutex_); + m_queue.wait_and_throw(); //pass + +} + +template +EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { + tileSize =static_cast(m_queue.get_device(). 
template get_info<cl::sycl::info::device::max_work_group_size>());
+  auto s=  m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
+  std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+  if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+    tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
+  }
+  rng = n;
+  if (rng==0) rng=static_cast<Index>(1);
+  GRange=rng;
+  if (tileSize>GRange) tileSize=GRange;
+  else if(GRange>tileSize){
+    Index xMode = static_cast<Index>(GRange % tileSize);
+    if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+  }
+}
+
+/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+template<typename Index>
+EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
+  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+  if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+  }
+  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+  rng1=dim1;
+  if (rng1==0 ) rng1=static_cast<Index>(1);
+  GRange1=rng1;
+  if (tileSize1>GRange1) tileSize1=GRange1;
+  else if(GRange1>tileSize1){
+    Index xMode = static_cast<Index>(GRange1 % tileSize1);
+    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+  }
+  tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
+  rng0 = dim0;
+  if (rng0==0 ) rng0=static_cast<Index>(1);
+  GRange0=rng0;
+  if (tileSize0>GRange0) tileSize0=GRange0;
+  else if(GRange0>tileSize0){
+    Index xMode = static_cast<Index>(GRange0 % tileSize0);
+    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+  }
+}
+
+
+/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
+template<typename Index>
+EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
+  Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
+  if(m_queue.get_device().is_cpu()){ // intel does not allow using the max workgroup size
+    max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
+  }
+  Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+  tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
+  rng2=dim2;
+  if (rng2==0 ) rng2=static_cast<Index>(1);
+  GRange2=rng2;
+  if (tileSize2>GRange2) tileSize2=GRange2;
+  else if(GRange2>tileSize2){
+    Index xMode = static_cast<Index>(GRange2 % tileSize2);
+    if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
+  }
+  pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
+  tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
+  rng1=dim1;
+  if (rng1==0 ) rng1=static_cast<Index>(1);
+  GRange1=rng1;
+  if (tileSize1>GRange1) tileSize1=GRange1;
+  else if(GRange1>tileSize1){
+    Index xMode = static_cast<Index>(GRange1 % tileSize1);
+    if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
+  }
+  tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
+  rng0 = dim0;
+  if (rng0==0 ) rng0=static_cast<Index>(1);
+  GRange0=rng0;
+  if (tileSize0>GRange0) tileSize0=GRange0;
+  else if(GRange0>tileSize0){
+    Index xMode = static_cast<Index>(GRange0 % tileSize0);
+    if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
+  }
+}
+
+EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+  std::lock_guard<std::mutex>
lock(mutex_); + return m_queue.get_device(). template get_info(); +// return stream_->deviceProperties().multiProcessorCount; +} +EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + std::lock_guard lock(mutex_); + return m_queue.get_device(). template get_info(); + +// return stream_->deviceProperties().maxThreadsPerBlock; +} +EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + std::lock_guard lock(mutex_); + // OpenCL doesnot have such concept + return 2;//sycl_queue().get_device(). template get_info(); +// return stream_->deviceProperties().maxThreadsPerMultiProcessor; +} +EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + std::lock_guard lock(mutex_); + return m_queue.get_device(). template get_info(); +// return stream_->deviceProperties().sharedMemPerBlock; +} /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key @@ -119,10 +270,10 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + std::lock_guard lock(mutex_); auto buf = cl::sycl::buffer(cl::sycl::range<1>(num_bytes)); auto ptr =buf.get_access().get_pointer(); buf.set_final_data(nullptr); - std::lock_guard lock(mutex_); buffer_map.insert(std::pair>(static_cast(ptr),buf)); return static_cast(ptr); } @@ -193,48 +344,13 @@ struct SyclDevice { /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(sycl_queue().get_device(). 
template get_info()); - auto s= sycl_queue().get_device().template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); - } - rng = n; - if (rng==0) rng=static_cast(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast(GRange % tileSize); - if (xMode != 0) GRange += static_cast(tileSize - xMode); - } + m_queue_stream->parallel_for_setup(n, tileSize, rng, GRange); } /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/tileSize1); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } + m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); } @@ -242,39 +358,8 @@ struct SyclDevice { /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast(GRange2 % tileSize2); - if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); - } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ 
- Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } + m_queue_stream->parallel_for_setup(dim0, dim1, dim2, tileSize0, tileSize1, tileSize2, rng0, rng1, rng2, GRange0, GRange1, GRange2); + } /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { @@ -319,8 +404,7 @@ struct SyclDevice { /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that /// this buffer is accessed, the data will be copied to the device. template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto host_acc= get_sycl_buffer(dst). template get_access(); - ::memcpy(host_acc.get_pointer(), src, n); + m_queue_stream->memcpyHostToDevice(dst,src,n); } /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the @@ -329,21 +413,7 @@ struct SyclDevice { /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it = m_queue_stream->find_buffer(src); - auto offset =static_cast(static_cast(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - // Assuming that the dst is the start of the destination pointer - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); - auto dst_acc =dest_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); + m_queue_stream->memcpyDeviceToHost(dst,src,n); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} @@ -366,8 +436,9 @@ struct SyclDevice { :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); + auto buf_acc = m_buf.template get_access(cgh); + typedef decltype(buf_acc) AccType; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); } }; @@ -403,14 +474,13 @@ struct SyclDevice { EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } EIGEN_STRONG_INLINE void synchronize() const { - sycl_queue().wait_and_throw(); //pass + m_queue_stream->synchronize(); //pass } EIGEN_STRONG_INLINE void asynchronousExec() const { ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. //sycl_queue().throw_asynchronous();// does not pass. 
Temporarily disabled
-    sycl_queue().wait_and_throw(); //pass
-
+    m_queue_stream->asynchronousExec();
  }
  // This function checks if the runtime recorded an error for the
  // underlying stream device.
@@ -418,8 +488,10 @@ struct SyclDevice {
    return m_queue_stream->ok();
  }
};
-
-
+// This is used as a distinguishable device inside the kernel, as the sycl device class is not standard layout.
+// This is internal and must not be used by users. This dummy device allows us to specialise the tensor evaluator
+// inside the kernel, so we can have two types of eval, for host and for device. This is required for the TensorArgMax operation.
+struct SyclKernelDevice:DefaultDevice{};

} // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d6415817b..8516b37b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -193,7 +193,12 @@ struct TensorEvaluator
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_data);
+#ifndef __SYCL_DEVICE_ONLY__
    return loadConstant(m_data+index);
+#else
+    CoeffReturnType tmp = m_data[index];
+    return tmp;
+#endif
  }

  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b5ef31d55..77c9c6c6e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -124,7 +124,9 @@ template <typename U, typename V> struct Tuple {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Tuple& operator= (const Tuple& rhs) {
+  #ifndef __SYCL_DEVICE_ONLY__
    if (&rhs == this) return *this;
+  #endif
    first = rhs.first;
    second = rhs.second;
    return *this;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c9c7acfdc..94899252b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -35,7 +35,7 @@ static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI
  /* Two accessors are used: one to the buffer that is being reduced,
   * and a second to local memory, used to store intermediate data. */
  auto aI =bufI.template get_access(h);
- auto aOut =bufOut.template get_access(h);
+ auto aOut =bufOut.template get_access(h);
  typedef decltype(aI) InputAccessor;
  typedef decltype(aOut) OutputAccessor;
  typedef cl::sycl::accessor LocalAccessor;
@@ -158,7 +158,7 @@ struct InnerReducer {
  typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
  // create a tuple of accessors from Evaluator
  Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto output_accessor = dev.template get_sycl_accessor(cgh, output);
+ auto output_accessor = dev.template get_sycl_accessor(cgh, output);
  ptrdiff_t out_offset = dev.get_offset(output);
  Index red_size = (num_values_to_reduce!=0)?
num_values_to_reduce : static_cast(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 9d5a6d4c1..3d6270614 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -32,6 +32,8 @@ struct MakeLocalPointer { namespace Eigen { + template class TensorTupleReducerDeviceOp; + template struct TensorEvaluator, SyclKernelDevice>; namespace TensorSycl { namespace internal { @@ -48,6 +50,13 @@ template struct GetType{ typedef T Type; }; +template struct ValueCondition { + static const size_t Res =X; +}; +template struct ValueCondition { + static const size_t Res =Y; +}; + } } } @@ -80,6 +89,9 @@ template struct GetType{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" +// TensorArgMaxSycl.h +#include "TensorArgMaxSycl.h" + /// this is used for extracting tensor convolution #include "TensorConvolutionSycl.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 9476c0ea8..d6ac7b91f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -103,7 +103,7 @@ KERNELBROKERCONVERT(, false, TensorEvalToOp) #undef KERNELBROKERCONVERT /// specialisation of the \ref ConvertToDeviceExpression struct when the node types are TensorForcedEvalOp and TensorLayoutSwapOp -#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(CVQual, ExprNode)\ +#define KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(CVQual, ExprNode)\ template \ struct ConvertToDeviceExpression > {\ typedef CVQual ExprNode< typename ConvertToDeviceExpression::Type> Type;\ @@ -111,15 +111,17 @@ struct ConvertToDeviceExpression > {\ // TensorForcedEvalOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorForcedEvalOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorForcedEvalOp) // TensorLayoutSwapOp -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorLayoutSwapOp) -KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorLayoutSwapOp) -#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP - +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorLayoutSwapOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorLayoutSwapOp) +//TensorIndexTupleOp +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(const,TensorIndexTupleOp) +KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP(,TensorIndexTupleOp) +#undef KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAPINDEXTUPLEOP /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp #define KERNELBROKERCONVERTREDUCTION(CVQual)\ @@ -132,6 +134,18 @@ KERNELBROKERCONVERTREDUCTION(const) KERNELBROKERCONVERTREDUCTION() #undef KERNELBROKERCONVERTREDUCTION +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp +#define KERNELBROKERCONVERTTUPLEREDUCTION(CVQual)\ +template \ +struct ConvertToDeviceExpression > {\ + typedef CVQual TensorTupleReducerOp::Type> Type;\ +}; + +KERNELBROKERCONVERTTUPLEREDUCTION(const) +KERNELBROKERCONVERTTUPLEREDUCTION() +#undef KERNELBROKERCONVERTTUPLEREDUCTION + +//TensorSlicingOp 
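// A minimal, self-contained sketch of the ConvertToDeviceExpression pattern
// behind the KERNELBROKERCONVERT* macros in this file: a metafunction walks an
// expression *type* and rewrites each node into its device-side form while
// preserving the shape of the tree. HostLeaf, DeviceLeaf and Sum are
// illustrative stand-ins, not Eigen types.
#include <type_traits>

struct HostLeaf {};
struct DeviceLeaf {};
template <typename L, typename R> struct Sum {};

template <typename T> struct ToDevice;                         // primary template
template <> struct ToDevice<HostLeaf> { typedef DeviceLeaf Type; };
template <typename L, typename R> struct ToDevice<Sum<L, R> > {
  typedef Sum<typename ToDevice<L>::Type, typename ToDevice<R>::Type> Type;
};

static_assert(std::is_same<ToDevice<Sum<HostLeaf, HostLeaf> >::Type,
                           Sum<DeviceLeaf, DeviceLeaf> >::value,
              "the rewrite keeps the tree structure intact");
int main() { return 0; }

// The TensorSlicingOp converter below follows exactly this pattern: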
#define KERNELBROKERCONVERTSLICEOP(CVQual)\ template\ struct ConvertToDeviceExpression >{\ @@ -142,7 +156,7 @@ KERNELBROKERCONVERTSLICEOP(const) KERNELBROKERCONVERTSLICEOP() #undef KERNELBROKERCONVERTSLICEOP - +//TensorStridingSlicingOp #define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ template\ struct ConvertToDeviceExpression >{\ @@ -153,7 +167,6 @@ KERNELBROKERCONVERTERSLICESTRIDEOP(const) KERNELBROKERCONVERTERSLICESTRIDEOP() #undef KERNELBROKERCONVERTERSLICESTRIDEOP - /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp #define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ template \ @@ -164,9 +177,6 @@ KERNELBROKERCONVERTCHIPPINGOP(const) KERNELBROKERCONVERTCHIPPINGOP() #undef KERNELBROKERCONVERTCHIPPINGOP - - - /// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorImagePatchOp #define KERNELBROKERCONVERTIMAGEPATCHOP(CVQual)\ template\ @@ -188,8 +198,6 @@ KERNELBROKERCONVERTVOLUMEPATCHOP(const) KERNELBROKERCONVERTVOLUMEPATCHOP() #undef KERNELBROKERCONVERTVOLUMEPATCHOP - - } // namespace internal } // namespace TensorSycl } // namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index af4eb5f13..24cc23f45 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -65,7 +65,6 @@ CVQual PlaceHolder, N>, Params...>{\ : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())){}\ }; - TENSORMAP(const) TENSORMAP() #undef TENSORMAP @@ -83,6 +82,7 @@ CVQual PlaceHolder &t)\ : expr(DeviceFixedSizeTensor::instantiate(utility::tuple::get(t))){}\ }; + TENSORMAPFIXEDSIZE(const) TENSORMAPFIXEDSIZE() #undef TENSORMAPFIXEDSIZE @@ -189,9 +189,6 @@ struct ExprConstructor, CVQual ASSIGN() #undef ASSIGN - - - /// specialisation of the \ref ExprConstructor struct when the node type is /// const TensorAssignOp #define CONVERSIONEXPRCONST(CVQual)\ @@ -252,8 +249,6 @@ FORCEDEVAL(const) FORCEDEVAL() #undef FORCEDEVAL - - #define TENSORCUSTOMUNARYOP(CVQual)\ template \ struct ExprConstructor,\ @@ -274,13 +269,6 @@ TENSORCUSTOMUNARYOP(const) TENSORCUSTOMUNARYOP() #undef TENSORCUSTOMUNARYOP -template struct ValueCondition { - static const size_t Res =X; -}; -template struct ValueCondition { - static const size_t Res =Y; -}; - /// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp #define SYCLREDUCTIONEXPR(CVQual)\ template \ @@ -299,6 +287,35 @@ SYCLREDUCTIONEXPR(const) SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR +/// specialisation of the \ref ExprConstructor struct when the node type is TensorTupleReducerOp +/// use reductionOp instead of the TensorTupleReducerOp in order to build the tensor map. Because the tensorMap is the output of Tensor ReductionOP. 
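// The constructor generated just below forwards return_dim, strides,
// stride_mod and stride_div to the device; those are the ingredients of the
// usual flat-index -> coordinate arithmetic that argmax/argmin need to turn
// the winning element's flat index into its coordinate along one dimension.
// A generic sketch with illustrative names, not Eigen's exact code:
#include <cstddef>
#include <iostream>

// Coordinate of flat index i along a dimension, column-major layout: divide
// by that dimension's stride (stride_div), then reduce modulo its size
// (stride_mod).
std::size_t coord_along_dim(std::size_t i, std::size_t stride_div, std::size_t stride_mod) {
  return (i / stride_div) % stride_mod;
}

int main() {
  // A 2x3x4 column-major tensor has strides {1, 2, 6}.
  // Flat index 9 = 1*1 + 1*2 + 1*6, i.e. coordinates (1, 1, 1).
  std::cout << coord_along_dim(9, 1, 2) << ' '    // dim 0 -> 1
            << coord_along_dim(9, 2, 3) << ' '    // dim 1 -> 1
            << coord_along_dim(9, 6, 4) << '\n';  // dim 2 -> 1
}

// The macro below builds the reduction-shaped TensorMap and forwards these
// stride parameters: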
+#define SYCLTUPLEREDUCTIONEXPR(CVQual)\ +template \ +struct ExprConstructor,\ +CVQual PlaceHolder, N>, Params...> {\ + static const auto NumRedDims= TensorReductionOp , MakeGlobalPointer>::NumDimensions;\ + static const auto NumIndices= ValueCondition::Res;\ +static const int Layout =static_cast(Eigen::internal::traits, MakeGlobalPointer>>::Layout);\ + typedef CVQual TensorMap<\ + Tensor::CoeffReturnType,NumIndices, Layout, typename TensorTupleReducerOp::Index>,\ + Layout,\ + MakeGlobalPointer\ + > XprType;\ + typedef typename TensorEvaluator , SyclKernelDevice>::Dimensions InputDimensions;\ + static const int NumDims = Eigen::internal::array_size::value;\ + typedef array StrideDims;\ + typedef const TensorTupleReducerDeviceOp Type;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ + :expr(Type(XprType(ConvertToActualTypeSycl(typename XprType::CoeffReturnType, utility::tuple::get(t)), fd.dimensions()),\ + fd.return_dim(), fd.strides(), fd.stride_mod(), fd.stride_div())) {\ + }\ +}; + +SYCLTUPLEREDUCTIONEXPR(const) +SYCLTUPLEREDUCTIONEXPR() +#undef SYCLTUPLEREDUCTIONEXPR /// specialisation of the \ref ExprConstructor struct when the node type is /// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp @@ -319,15 +336,18 @@ CVQual PlaceHolder, N>, Params :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ }; +//TensorContractionOp SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp) SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp) +//TensorConvolutionOp SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp) SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp) +//TensorCustomBinaryOp SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp) SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp) #undef SYCLCONTRACTCONVCUSBIOPS - +//TensorSlicingOp #define SYCLSLICEOPEXPR(CVQual)\ template\ struct ExprConstructor , CVQual TensorSlicingOp, Params... >{\ @@ -344,7 +364,7 @@ SYCLSLICEOPEXPR(const) SYCLSLICEOPEXPR() #undef SYCLSLICEOPEXPR - +//TensorStridingSlicingOp #define SYCLSLICESTRIDEOPEXPR(CVQual)\ template\ struct ExprConstructor, CVQual TensorStridingSlicingOp, Params... >{\ @@ -361,6 +381,7 @@ SYCLSLICESTRIDEOPEXPR(const) SYCLSLICESTRIDEOPEXPR() #undef SYCLSLICESTRIDEOPEXPR +//TensorReshapingOp and TensorShufflingOp #define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\ template\ struct ExprConstructor , CVQual OPEXPR , Params... >{\ @@ -373,13 +394,15 @@ struct ExprConstructor , CVQual OPEXPR \ struct ExprConstructor , CVQual OPEXPR , Params... >{\ @@ -392,11 +415,11 @@ struct ExprConstructor , CVQual OPEXPR \ @@ -454,14 +477,12 @@ SYCLTENSORVOLUMEPATCHOPEXPR(const) SYCLTENSORVOLUMEPATCHOPEXPR() #undef SYCLTENSORVOLUMEPATCHOPEXPR - - -// TensorLayoutSwapOp -#define SYCLTENSORLAYOUTSWAPOPEXPR(CVQual)\ +// TensorLayoutSwapOp and TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(CVQual, ExprNode)\ template\ -struct ExprConstructor , CVQual TensorLayoutSwapOp, Params... >{\ +struct ExprConstructor , CVQual ExprNode, Params... 
>{\ typedef ExprConstructor my_xpr_type;\ - typedef CVQual TensorLayoutSwapOp Type;\ + typedef CVQual ExprNode Type;\ my_xpr_type xprExpr;\ Type expr;\ template \ @@ -469,10 +490,14 @@ struct ExprConstructor , CVQual TensorLa : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr) {}\ }; -SYCLTENSORLAYOUTSWAPOPEXPR(const) -SYCLTENSORLAYOUTSWAPOPEXPR() -#undef SYCLTENSORLAYOUTSWAPOPEXPR +//TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(const, TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR(, TensorIndexTupleOp) +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXPR /// template deduction for \ref ExprConstructor struct template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 5a6a8f4c5..fb95af59e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -147,7 +147,7 @@ SYCLFORCEDEVALEXTACC(const) SYCLFORCEDEVALEXTACC() #undef SYCLFORCEDEVALEXTACC - +//TensorCustomUnaryOp #define SYCLCUSTOMUNARYOPEXTACC(CVQual)\ template \ struct ExtractAccessor, Dev> > {\ @@ -160,7 +160,7 @@ SYCLCUSTOMUNARYOPEXTACC(const) SYCLCUSTOMUNARYOPEXTACC() #undef SYCLCUSTOMUNARYOPEXTACC - +//TensorCustomBinaryOp #define SYCLCUSTOMBINARYOPEXTACC(CVQual)\ template \ struct ExtractAccessor, Dev> > {\ @@ -172,9 +172,6 @@ SYCLCUSTOMBINARYOPEXTACC(const) SYCLCUSTOMBINARYOPEXTACC() #undef SYCLCUSTOMBIBARYOPEXTACC - - - /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp #define SYCLEVALTOEXTACC(CVQual)\ template \ @@ -188,15 +185,19 @@ SYCLEVALTOEXTACC() #undef SYCLEVALTOEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp -#define SYCLREDUCTIONEXTACC(CVQual)\ +#define SYCLREDUCTIONEXTACC(CVQual, ExprNode)\ template \ -struct ExtractAccessor, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ +struct ExtractAccessor, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; +// TensorReductionOp +SYCLREDUCTIONEXTACC(const,TensorReductionOp) +SYCLREDUCTIONEXTACC(,TensorReductionOp) -SYCLREDUCTIONEXTACC(const) -SYCLREDUCTIONEXTACC() +// TensorTupleReducerOp +SYCLREDUCTIONEXTACC(const,TensorTupleReducerOp) +SYCLREDUCTIONEXTACC(,TensorTupleReducerOp) #undef SYCLREDUCTIONEXTACC /// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp @@ -206,14 +207,14 @@ template, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; - +//TensorContractionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) +//TensorConvolutionOp SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) #undef SYCLCONTRACTIONCONVOLUTIONEXTACC - /// specialisation of the \ref ExtractAccessor struct when the node type is /// const TensorSlicingOp. 
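// A minimal sketch (C++14 for brevity) of the getTuple recursion these
// ExtractAccessor specialisations implement: every leaf contributes one
// accessor, and interior nodes concatenate their children's tuples.
// std::tuple stands in for Eigen's utility::tuple; Leaf, Node and get_tuple
// are illustrative names.
#include <tuple>
#include <iostream>

struct Leaf { int accessor; };                   // stand-in for a device accessor

template <typename L, typename R> struct Node { L lhs; R rhs; };

std::tuple<int> get_tuple(const Leaf& l) { return std::make_tuple(l.accessor); }

template <typename L, typename R>
auto get_tuple(const Node<L, R>& n) {
  return std::tuple_cat(get_tuple(n.lhs), get_tuple(n.rhs)); // merge child tuples
}

int main() {
  Node<Leaf, Node<Leaf, Leaf> > expr{{1}, {{2}, {3}}};
  auto t = get_tuple(expr);                      // -> (1, 2, 3)
  std::cout << std::get<0>(t) << std::get<1>(t) << std::get<2>(t) << '\n';
}

// The slicing-op extractor below contributes its child's tuple unchanged: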
#define SYCLSLICEOPEXTACC(CVQual)\ @@ -252,7 +253,6 @@ SYCLTENSORCHIPPINGOPEXTACC(const) SYCLTENSORCHIPPINGOPEXTACC() #undef SYCLTENSORCHIPPINGOPEXTACC - // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorImagePatchOp. #define SYCLTENSORIMAGEPATCHOPEXTACC(CVQual)\ @@ -266,8 +266,6 @@ SYCLTENSORIMAGEPATCHOPEXTACC(const) SYCLTENSORIMAGEPATCHOPEXTACC() #undef SYCLTENSORIMAGEPATCHOPEXTACC - - // specialisation of the \ref ExtractAccessor struct when the node type is /// TensorVolumePatchOp. #define SYCLTENSORVOLUMEPATCHOPEXTACC(CVQual)\ @@ -281,21 +279,23 @@ SYCLTENSORVOLUMEPATCHOPEXTACC(const) SYCLTENSORVOLUMEPATCHOPEXTACC() #undef SYCLTENSORVOLUMEPATCHOPEXTACC - // specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorLayoutSwapOp. -#define SYCLTENSORLAYOUTSWAPOPEXTACC(CVQual)\ +/// TensorLayoutSwapOp, TensorIndexTupleOp +#define SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(CVQual, ExprNode)\ template\ -struct ExtractAccessor, Dev> >{\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ }; -SYCLTENSORLAYOUTSWAPOPEXTACC(const) -SYCLTENSORLAYOUTSWAPOPEXTACC() -#undef SYCLTENSORLAYOUTSWAPOPEXTACC - +// TensorLayoutSwapOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorLayoutSwapOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorLayoutSwapOp) +//TensorIndexTupleOp +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(const,TensorIndexTupleOp) +SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC(,TensorIndexTupleOp) +#undef SYCLTENSORLAYOUTSWAPINDEXTUPLEOPEXTACC /// template deduction for \ref ExtractAccessor template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 9fcac5ecb..942e9d307 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -126,19 +126,19 @@ struct FunctorExtractor, Dev> Evaluator;\ DEFALTACTION(Evaluator)\ }; - +//TensorCustomUnaryOp SYCLEXTRFUNCCUSTOMUNARYOP(const) SYCLEXTRFUNCCUSTOMUNARYOP() #undef SYCLEXTRFUNCCUSTOMUNARYOP - +//TensorCustomBinaryOp #define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\ template \ struct FunctorExtractor, Dev> > {\ typedef TensorEvaluator, Dev> Evaluator;\ DEFALTACTION(Evaluator)\ }; - +//TensorCustomBinaryOp SYCLEXTRFUNCCUSTOMBIBARYOP(const) SYCLEXTRFUNCCUSTOMBIBARYOP() #undef SYCLEXTRFUNCCUSTOMBIBARYOP @@ -177,7 +177,7 @@ SYCLEXTRFUNCASSIGNOP() /// specialisation of the \ref FunctorExtractor struct when the node types are /// TensorEvalToOp, TensorLayoutSwapOp. This is an specialisation without OP so it has to be separated. 
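// A minimal sketch of the FunctorExtractor idea behind these macros: the
// host-side evaluator owns state that must not cross into a SYCL kernel
// (pointers, queues), so only the trivially copyable pieces -- the functor
// and, where needed, the dimensions -- are copied out and shipped to the
// device. HostEvaluator and ExtractedFunctor are illustrative names.
#include <iostream>

struct HostEvaluator {           // host-only: holds a raw pointer
  float* data;
  float scale;                   // the functor state the kernel actually needs
};

struct ExtractedFunctor {        // device-safe: value members only
  float scale;
  explicit ExtractedFunctor(const HostEvaluator& e) : scale(e.scale) {}
  float operator()(float x) const { return scale * x; }
};

int main() {
  float buf[3] = {1.f, 2.f, 3.f};
  HostEvaluator ev = {buf, 2.f};
  ExtractedFunctor f(ev);        // this is what a kernel would capture by value
  for (int i = 0; i < 3; ++i) std::cout << f(buf[i]) << ' ';
  std::cout << '\n';             // 2 4 6
}

// The EvalTo/LayoutSwap/IndexTuple extractors below carry no OP at all, which
// is why they get their own macro: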
-#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(CVQual, ExprNode)\ +#define SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(CVQual, ExprNode)\ template \ struct FunctorExtractor, Dev> > {\ FunctorExtractor > xprExpr;\ @@ -185,13 +185,16 @@ struct FunctorExtractor, Dev> > {\ : xprExpr(expr.impl()) {}\ }; //TensorEvalToOp -SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorEvalToOp) -SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorEvalToOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorEvalToOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorEvalToOp) // TensorLayoutSwapOp -SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(const, TensorLayoutSwapOp) -SYCLEXTRFUNCEVALTOOPSWAPLAYOUT(, TensorLayoutSwapOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorLayoutSwapOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorLayoutSwapOp) +// TensorIndexTupleOp +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(const, TensorIndexTupleOp) +SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE(, TensorIndexTupleOp) -#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUT +#undef SYCLEXTRFUNCEVALTOOPSWAPLAYOUTINDEXTUPLE template struct DimConstr { template @@ -202,10 +205,10 @@ template struct DimConstr { template static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast(dims.TotalSize()));} }; - +//TensorReductionOp #define SYCLEXTRFUNCREDUCTIONOP(CVQual)\ template class MakePointer_, typename Device>\ -struct FunctorExtractor, Device>>{\ +struct FunctorExtractor, Device> >{\ typedef TensorEvaluator, Device> Evaluator;\ typedef typename Eigen::internal::conditional, typename Evaluator::Dimensions >::type Dimensions;\ const Dimensions m_dimensions;\ @@ -213,12 +216,39 @@ struct FunctorExtractor, Device>& expr)\ : m_dimensions(DimConstr::getDim(expr.dimensions())) {}\ }; - - SYCLEXTRFUNCREDUCTIONOP(const) SYCLEXTRFUNCREDUCTIONOP() #undef SYCLEXTRFUNCREDUCTIONOP +//TensorTupleReducerOp +#define SYCLEXTRFUNCTUPLEREDUCTIONOP(CVQual)\ +template\ + struct FunctorExtractor, Device> >{\ + typedef TensorEvaluator, Device> Evaluator;\ + static const int NumOutputDims= Eigen::internal::traits >::NumDimensions;\ + typedef typename Evaluator::StrideDims StrideDims;\ + typedef typename Evaluator::Index Index;\ + typedef typename Eigen::internal::conditional, typename Evaluator::Dimensions >::type Dimensions;\ + const Dimensions m_dimensions;\ + const int m_return_dim;\ + const StrideDims m_strides;\ + const Index m_stride_mod;\ + const Index m_stride_div;\ + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ + EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;}\ + EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}\ + EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}\ + EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}\ + FunctorExtractor(const TensorEvaluator, Device>& expr)\ + : m_dimensions(DimConstr::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\ + m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\ +}; + +SYCLEXTRFUNCTUPLEREDUCTIONOP(const) +SYCLEXTRFUNCTUPLEREDUCTIONOP() +#undef SYCLEXTRFUNCTUPLEREDUCTIONOP + +//TensorContractionOp and TensorConvolutionOp #define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ template\ struct FunctorExtractor, Device>>{\ @@ -230,9 +260,10 @@ struct FunctorExtractor\ struct FunctorExtractor, Dev> >{\ @@ -273,7 +305,7 @@ SYCLEXTRFUNCTSLICESTRIDEOP(const) SYCLEXTRFUNCTSLICESTRIDEOP() #undef SYCLEXTRFUNCTSLICESTRIDEOP -// Had to separate reshapeOP otherwise it 
will be mistaken by UnaryCategory +// Had to separate TensorReshapingOp and TensorShufflingOp. Otherwise it will be mistaken by UnaryCategory #define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\ template\ struct FunctorExtractor, Dev> > {\ @@ -284,9 +316,11 @@ struct FunctorExtractor\ struct FunctorExtractor, Device> >{\ @@ -420,7 +455,6 @@ SYCLEXTRFUNCVOLUMEPATCHOP() #undef SYCLEXTRFUNCVOLUMEPATCHOP - /// template deduction function for FunctorExtractor template auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 12237bfab..e5b892f2e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -72,7 +72,7 @@ namespace internal { template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { public: typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; - typedef cl::sycl::accessor write_accessor; + typedef cl::sycl::accessor write_accessor; ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) :output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void operator()(cl::sycl::nd_item<1> itemID) { @@ -85,8 +85,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, functor); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - typedef Eigen::TensorEvaluator DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + typedef Eigen::TensorEvaluator DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::SyclKernelDevice()); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=static_cast(itemID.get_global_linear_id()); @@ -111,7 +111,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen class ReductionFunctor, Index> { public: typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; - typedef cl::sycl::accessor write_accessor; + typedef cl::sycl::accessor write_accessor; typedef Eigen::internal::SumReducer Op; ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Eigen::internal::MeanReducer, Index range_, Index num_values_to_reduce_) @@ -126,8 +126,8 @@ class ReductionFunctor(device_expr.expr, dims, functor); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. 
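// A minimal sketch of the device-tag dispatch that motivates the change just
// below: the device type acts as a tag, so giving the kernel its own tag type
// lets TensorEvaluator select a kernel-safe code path while the host keeps its
// own. DefaultDeviceTag, KernelDeviceTag and EvaluatorSketch are illustrative.
#include <iostream>

struct DefaultDeviceTag {};
struct KernelDeviceTag : DefaultDeviceTag {};    // distinct type, same interface

template <typename Device> struct EvaluatorSketch {
  static const char* path() { return "host path (may use host-only helpers)"; }
};
template <> struct EvaluatorSketch<KernelDeviceTag> {
  static const char* path() { return "kernel path (no host-only calls)"; }
};

int main() {
  std::cout << EvaluatorSketch<DefaultDeviceTag>::path() << '\n';
  std::cout << EvaluatorSketch<KernelDeviceTag>::path() << '\n';
}

// Hence the swap from DefaultDevice to SyclKernelDevice below: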
- typedef Eigen::TensorEvaluator DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + typedef Eigen::TensorEvaluator DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::SyclKernelDevice()); auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=static_cast(itemID.get_global_linear_id()); @@ -173,7 +173,7 @@ public: const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, op); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::SyclKernelDevice()); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); @@ -220,7 +220,7 @@ public: const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, op); /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::SyclKernelDevice()); /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); auto scale = (rng*red_factor) + remaining; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 330283b39..234580c7c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -114,27 +114,37 @@ SYCLCUSTOMBINARYOPLEAFCOUNT() #undef SYCLCUSTOMBINARYOPLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp -#define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\ +#define EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(CVQual , ExprNode, Num)\ template \ struct LeafCount > {\ static const size_t Count = Num + CategoryCount::Count;\ }; -EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorEvalToOp, 1) -EVALTOLAYOUTSWAPLEAFCOUNT(, TensorEvalToOp, 1) -EVALTOLAYOUTSWAPLEAFCOUNT(const, TensorLayoutSwapOp, 0) -EVALTOLAYOUTSWAPLEAFCOUNT(, TensorLayoutSwapOp, 0) -#undef EVALTOLAYOUTSWAPLEAFCOUNT +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorEvalToOp, 1) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorLayoutSwapOp, 0) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorLayoutSwapOp, 0) + +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(const, TensorIndexTupleOp, 0) +EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT(, TensorIndexTupleOp, 0) + +#undef EVALTOLAYOUTSWAPINDEXTUPLELEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp -#define REDUCTIONLEAFCOUNT(CVQual)\ +#define REDUCTIONLEAFCOUNT(CVQual, ExprNode)\ template \ -struct LeafCount > {\ +struct LeafCount > {\ static const size_t Count =1;\ }; -REDUCTIONLEAFCOUNT(const) -REDUCTIONLEAFCOUNT() +// TensorReductionOp 
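// A minimal sketch of the LeafCount scheme these macros implement: leaves
// report 1, interior nodes sum their children, and a reduction deliberately
// reports 1 because it is materialised into its own buffer before the
// surrounding kernel runs. The expression types here are illustrative.
#include <cstddef>
#include <iostream>

struct LeafExpr {};
template <typename L, typename R> struct BinaryExpr {};
template <typename Arg> struct ReductionExpr {};

template <typename T> struct LeafCountSketch;
template <> struct LeafCountSketch<LeafExpr> { static const std::size_t Count = 1; };
template <typename L, typename R> struct LeafCountSketch<BinaryExpr<L, R> > {
  static const std::size_t Count = LeafCountSketch<L>::Count + LeafCountSketch<R>::Count;
};
template <typename Arg> struct LeafCountSketch<ReductionExpr<Arg> > {
  static const std::size_t Count = 1;            // one opaque, pre-evaluated buffer
};

int main() {
  typedef BinaryExpr<LeafExpr, ReductionExpr<BinaryExpr<LeafExpr, LeafExpr> > > Expr;
  std::cout << LeafCountSketch<Expr>::Count << '\n';  // prints 2, not 3
}

// Both the plain reduction and the argmax tuple reducer instantiate that
// Count = 1 rule below: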
+REDUCTIONLEAFCOUNT(const,TensorReductionOp) +REDUCTIONLEAFCOUNT(,TensorReductionOp) + +// tensor Argmax -TensorTupleReducerOp +REDUCTIONLEAFCOUNT(const, TensorTupleReducerOp) +REDUCTIONLEAFCOUNT(, TensorTupleReducerOp) + #undef REDUCTIONLEAFCOUNT /// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp @@ -150,8 +160,6 @@ CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp) CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) #undef CONTRACTIONCONVOLUTIONLEAFCOUNT - - /// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp #define SLICEOPLEAFCOUNT(CVQual)\ template \ @@ -161,7 +169,6 @@ SLICEOPLEAFCOUNT(const) SLICEOPLEAFCOUNT() #undef SLICEOPLEAFCOUNT - /// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp #define CHIPPINGOPLEAFCOUNT(CVQual)\ template \ @@ -195,7 +202,6 @@ TENSORIMAGEPATCHOPLEAFCOUNT() template\ struct LeafCount >:CategoryCount{}; - TENSORVOLUMEPATCHOPLEAFCOUNT(const) TENSORVOLUMEPATCHOPLEAFCOUNT() #undef TENSORVOLUMEPATCHOPLEAFCOUNT diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 99d528963..9d5708fc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -171,19 +171,24 @@ CUSTOMBINARYOPEVAL() /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorEvalToOp, TensorLayoutSwapOp -#define EVALTOLAYOUTSWAP(CVQual, ExprNode)\ +/// TensoroOp, TensorLayoutSwapOp, and TensorIndexTupleOp +#define EVALTOLAYOUTSWAPINDEXTUPLE(CVQual, ExprNode)\ template \ struct PlaceHolderExpression, N> {\ typedef CVQual ExprNode::ArgType> Type;\ }; -EVALTOLAYOUTSWAP(const, TensorEvalToOp) -EVALTOLAYOUTSWAP(, TensorEvalToOp) -EVALTOLAYOUTSWAP(const, TensorLayoutSwapOp) -EVALTOLAYOUTSWAP(, TensorLayoutSwapOp) +// TensorEvalToOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorEvalToOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorEvalToOp) +//TensorLayoutSwapOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorLayoutSwapOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorLayoutSwapOp) +//TensorIndexTupleOp +EVALTOLAYOUTSWAPINDEXTUPLE(const, TensorIndexTupleOp) +EVALTOLAYOUTSWAPINDEXTUPLE(, TensorIndexTupleOp) -#undef EVALTOLAYOUTSWAP +#undef EVALTOLAYOUTSWAPINDEXTUPLE /// specialisation of the \ref PlaceHolderExpression when the node is @@ -199,17 +204,24 @@ CHIPPINGOP() #undef CHIPPINGOP /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorReductionOp -#define SYCLREDUCTION(CVQual)\ +/// TensorReductionOp and TensorTupleReducerOp (Argmax) +#define SYCLREDUCTION(CVQual, ExprNode)\ template \ -struct PlaceHolderExpression, N>{\ - typedef CVQual PlaceHolder, N> Type;\ +struct PlaceHolderExpression, N>{\ + typedef CVQual PlaceHolder, N> Type;\ }; -SYCLREDUCTION(const) -SYCLREDUCTION() + +// tensor reduction +SYCLREDUCTION(const, TensorReductionOp) +SYCLREDUCTION(, TensorReductionOp) + +// tensor Argmax -TensorTupleReducerOp +SYCLREDUCTION(const, TensorTupleReducerOp) +SYCLREDUCTION(, TensorTupleReducerOp) #undef SYCLREDUCTION + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorReductionOp #define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index cac785540..29c78184d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -25,7 +25,6 @@ namespace Eigen { namespace TensorSycl { - template struct ExecExprFunctorKernel{ typedef typename internal::createPlaceHolderExpression::Type PlaceHolderExpr; @@ -38,7 +37,7 @@ template struct ExecEx void operator()(cl::sycl::nd_item<1> itemID) { typedef typename internal::ConvertToDeviceExpression::Type DevExpr; auto device_expr =internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::SyclKernelDevice()); typename DevExpr::Index gId = static_cast(itemID.get_global_linear_id()); if (gId < range) device_evaluator.evalScalar(gId); diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 996178292..4a558f856 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -173,6 +173,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_argmax_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp new file mode 100644 index 000000000..9b22f1eca --- /dev/null +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -0,0 +1,248 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
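// For orientation before the SYCL test body: this is what argmax()/argmin()
// compute on the host, which the test below replays through a SyclDevice.
// A minimal sketch assuming the usual unsupported include path; it needs only
// the CPU tensor module.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setZero();
  t(0, 0) = 1.0f;
  t(1, 2) = 5.0f;                    // flat (column-major) index 1 + 2*2 = 5

  // Rank-0 result: the flat index of the overall maximum.
  Eigen::Tensor<Eigen::DenseIndex, 0> amax = t.argmax();
  std::cout << amax() << '\n';       // 5

  // Reducing along dimension 1 yields one winning column index per row.
  Eigen::Tensor<Eigen::DenseIndex, 1> amax1 = t.argmax(1);
  std::cout << amax1(0) << ' ' << amax1(1) << '\n';  // 0 2
}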
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_argmax_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template +static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ + + Tensor in(Eigen::array{{2,2,2}}); + Tensor out_max; + Tensor out_min; + in.setRandom(); + in *= in.constant(100.0); + in(0, 0, 0) = -1000.0; + in(1, 1, 1) = 1000.0; + + std::size_t in_bytes = in.size() * sizeof(DataType); + std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); + + DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out_max = static_cast(sycl_device.allocate(out_bytes)); + DenseIndex* d_out_min = static_cast(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap > gpu_in(d_in, Eigen::array{{2,2,2}}); + Eigen::TensorMap > gpu_out_max(d_out_max); + Eigen::TensorMap > gpu_out_min(d_out_min); + sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes); + + gpu_out_max.device(sycl_device) = gpu_in.argmax(); + gpu_out_min.device(sycl_device) = gpu_in.argmin(); + + sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes); + sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes); + + VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1); + VERIFY_IS_EQUAL(out_min(), 0); + + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out_max); + sycl_device.deallocate(d_out_min); +} + + +template +static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) +{ + DenseIndex sizeDim0=9; + DenseIndex sizeDim1=3; + DenseIndex sizeDim2=5; + DenseIndex sizeDim3=7; + Tensor tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); + + std::vector dims; + dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + + array out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; + + Tensor tensor_arg(out_shape); + + array ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix)=(ix[dim] != 0)?-1.0:10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + + DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out= static_cast(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap > gpu_in(d_in, Eigen::array{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast(tensor_arg.size()), + size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + + + + + +template +static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) +{ + DenseIndex sizeDim0=9; + DenseIndex sizeDim1=3; + DenseIndex sizeDim2=5; + DenseIndex sizeDim3=7; + Tensor tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); + + std::vector dims; + dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + + array out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d+1]; + + Tensor tensor_arg(out_shape); + + array ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 + tensor(ix)=(ix[dim] != 0)?1.0:-10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + + DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out= static_cast(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap > gpu_in(d_in, Eigen::array{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast(tensor_arg.size()), + size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + + + + +template void sycl_argmax_test_per_device(const Device_Selector& d){ + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_simple_argmax(sycl_device); + test_sycl_simple_argmax(sycl_device); + test_sycl_argmax_dim(sycl_device); + test_sycl_argmax_dim(sycl_device); + test_sycl_argmin_dim(sycl_device); + test_sycl_argmin_dim(sycl_device); +} + +void test_cxx11_tensor_argmax_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_argmax_test_per_device(device)); + } +} From e2e3f785331cb90ae07b7ca7829be0ffecf6811b Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 7 Mar 2017 17:48:15 +0000 Subject: [PATCH 062/680] Fixing potential race condition on sycl device. 
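The crux of the fix is that every access to the shared pointer-to-buffer map
(and to the queue it feeds) must happen under the same mutex, taken at the top
of each member before any shared state is touched. The pattern, reduced to a
minimal sketch with a std::map standing in for QueueInterface's buffer_map and
integer keys standing in for the fake device pointers (illustrative only, not
the real class):

#include <cstddef>
#include <iostream>
#include <map>
#include <mutex>
#include <thread>
#include <vector>

class RegistrySketch {
  mutable std::mutex mutex_;
  std::map<std::size_t, std::size_t> buffer_map;   // key -> allocation size
  std::size_t next_key_;
public:
  RegistrySketch() : next_key_(1) {}
  std::size_t allocate(std::size_t n) {
    std::lock_guard<std::mutex> lock(mutex_);      // lock before touching shared state
    const std::size_t key = next_key_++;
    buffer_map.insert(std::make_pair(key, n));
    return key;
  }
  std::size_t count() const {
    std::lock_guard<std::mutex> lock(mutex_);
    return buffer_map.size();
  }
};

int main() {
  RegistrySketch r;
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i)
    workers.push_back(std::thread([&r] { for (int j = 0; j < 100; ++j) r.allocate(16); }));
  for (std::size_t i = 0; i < workers.size(); ++i) workers[i].join();
  std::cout << r.count() << '\n';                  // always 400: no lost inserts
}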
--- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 508 +++++++++--------- 1 file changed, 262 insertions(+), 246 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 258218463..23297a0a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -41,6 +41,7 @@ namespace Eigen { size_t m_i; size_t m_offset; }; + template struct memsetkernelFunctor{ AccType m_acc; @@ -54,6 +55,21 @@ template }; +struct memsetCghFunctor{ + cl::sycl::buffer& m_buf; + const ptrdiff_t& buff_offset; + const size_t& rng , GRange, tileSize; + const int &c; + memsetCghFunctor(cl::sycl::buffer& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} + + void operator()(cl::sycl::handler &cgh) const { + auto buf_acc = m_buf.template get_access(cgh); + typedef decltype(buf_acc) AccType; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); + } +}; + //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); @@ -75,18 +91,8 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device return devices; } -struct QueueInterface { - /// class members: - bool exception_caught_ = false; - - mutable std::mutex mutex_; - - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map> buffer_map; - /// sycl queue - mutable cl::sycl::queue m_queue; +class QueueInterface { +public: /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. template explicit QueueInterface(const dev_Selector& s): @@ -115,155 +121,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { })) #endif {} -//FIXME: currently we have to switch back to write as discard_write doesnot work in forloop -template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); - auto host_acc= find_buffer(dst)->second. 
template get_access(); - ::memcpy(host_acc.get_pointer(), src, n); -} - -template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); - // Assuming that the dst is the start of the destination pointer -auto it =find_buffer(src); -auto offset =static_cast(static_cast(src))- it->first; -offset/=sizeof(Index); -size_t rng, GRange, tileSize; -parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); - m_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); - auto dst_acc =dest_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); - -} - -EIGEN_STRONG_INLINE void synchronize() const { - std::lock_guard lock(mutex_); - m_queue.wait_and_throw(); //pass -} -EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled - std::lock_guard lock(mutex_); - m_queue.wait_and_throw(); //pass - -} - -template -EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(m_queue.get_device(). template get_info()); - auto s= m_queue.get_device().template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); - } - rng = n; - if (rng==0) rng=static_cast(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast(GRange % tileSize); - if (xMode != 0) GRange += static_cast(tileSize - xMode); - } -} - -/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels -template -EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/tileSize1); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } -} - - - -/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels -template -EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, 
Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast(GRange2 % tileSize2); - if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); - } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } -} - - -EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - std::lock_guard lock(mutex_); - return m_queue.get_device(). template get_info(); -// return stream_->deviceProperties().multiProcessorCount; -} -EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - std::lock_guard lock(mutex_); - return m_queue.get_device(). template get_info(); - -// return stream_->deviceProperties().maxThreadsPerBlock; -} -EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - std::lock_guard lock(mutex_); - // OpenCL doesnot have such concept - return 2;//sycl_queue().get_device(). template get_info(); -// return stream_->deviceProperties().maxThreadsPerMultiProcessor; -} -EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - std::lock_guard lock(mutex_); - return m_queue.get_device(). template get_info(); -// return stream_->deviceProperties().sharedMemPerBlock; -} - /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we @@ -292,9 +149,233 @@ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { std::lock_guard lock(mutex_); buffer_map.clear(); } - - EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { + //FIXME: currently we have to switch back to write as discard_write doesnot work in forloop + /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device + /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode + /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the + /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that + /// this buffer is accessed, the data will be copied to the device. 
+ template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { + std::lock_guard lock(mutex_); + auto host_acc= find_buffer(dst)->second. template get_access(); + ::memcpy(host_acc.get_pointer(), src, n); + } + /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl + /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the + /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination + /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data + /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back + /// to the cpu only once per function call. + template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { std::lock_guard lock(mutex_); + auto it =find_buffer(src); + auto offset =static_cast(static_cast(src))- it->first; + offset/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc= it->second.template get_access(cgh); + auto dst_acc =dest_buf.template get_access(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); + }); + synchronize(); + } + + /// the memcpy function + template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + std::lock_guard lock(mutex_); + auto it1 = find_buffer(static_cast(src)); + auto it2 = find_buffer(dst); + auto offset= (static_cast(static_cast(src))) - it1->first; + auto i= (static_cast(dst)) - it2->first; + offset/=sizeof(Index); + i/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc =it1->second.template get_access(cgh); + auto dst_acc =it2->second.template get_access(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); + }); + synchronize(); + } + + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + std::lock_guard lock(mutex_); + size_t rng, GRange, tileSize; + parallel_for_setup(n, tileSize, rng, GRange); + auto it1 = find_buffer(static_cast(data)); + ptrdiff_t buff_offset= (static_cast(data)) - it1->first; + m_queue.submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); + synchronize(); + } + + /// Creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. If found it gets the accessor from it, if not, + /// the function then adds an entry by creating a sycl buffer for that particular pointer. 
+ template EIGEN_STRONG_INLINE cl::sycl::accessor + get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { + std::lock_guard lock(mutex_); + return (find_buffer(ptr)->second.template get_access(cgh)); + } + + /// Accessing the created sycl device buffer for the device pointer + EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { + std::lock_guard lock(mutex_); + return find_buffer(ptr)->second; + } + + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + std::lock_guard lock(mutex_); + return (static_cast(ptr))-(find_buffer(ptr)->first); + } + + EIGEN_STRONG_INLINE void synchronize() const { + m_queue.wait_and_throw(); //pass + } + + EIGEN_STRONG_INLINE void asynchronousExec() const { + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + //sycl_queue().throw_asynchronous();// FIXME::does not pass. Temporarily disabled + m_queue.wait_and_throw(); //pass + } + + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { + tileSize =static_cast(m_queue.get_device(). template get_info()); + auto s= m_queue.get_device().template get_info(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + tileSize=std::min(static_cast(256), static_cast(tileSize)); + } + rng = n; + if (rng==0) rng=static_cast(1); + GRange=rng; + if (tileSize>GRange) tileSize=GRange; + else if(GRange>tileSize){ + Index xMode = static_cast(GRange % tileSize); + if (xMode != 0) GRange += static_cast(tileSize - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { + Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + } + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + } + tileSize0 = static_cast(max_workgroup_Size/tileSize1); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); + if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + } + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize2 
=static_cast(std::pow(2, static_cast(pow_of_2/3))); + rng2=dim2; + if (rng2==0 ) rng2=static_cast(1); + GRange2=rng2; + if (tileSize2>GRange2) tileSize2=GRange2; + else if(GRange2>tileSize2){ + Index xMode = static_cast(GRange2 % tileSize2); + if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); + } + pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + } + tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + } + } + + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return m_queue.get_device(). template get_info(); + } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return m_queue.get_device(). template get_info(); + } + + /// Not needed for sycl; it should act the same as the CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL does not have such a concept + return 2; + } + + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return m_queue.get_device(). template get_info(); + } + + EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue;} + + // This function checks if the runtime recorded an error for the + // underlying stream device. + EIGEN_STRONG_INLINE bool ok() const { + if (!exception_caught_) { + m_queue.wait_and_throw(); + } + return !exception_caught_; + } + + // destructor + ~QueueInterface() { buffer_map.clear(); } + +private: + /// class members: + bool exception_caught_ = false; + + mutable std::mutex mutex_; + + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. + /// If a non-read-only pointer needs to be accessed on the host we should manually deallocate it. + mutable std::map> buffer_map; + /// sycl queue + mutable cl::sycl::queue m_queue; + EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { auto it1 = buffer_map.find(static_cast(ptr)); if (it1 != buffer_map.end()){ return it1; @@ -308,37 +389,25 @@ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; abort(); } - - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - if (!exception_caught_) { - m_queue.wait_and_throw(); - } - return !exception_caught_; - } - - // destructor - ~QueueInterface() { buffer_map.clear(); } }; +// Here is a SYCL device struct which accepts the SYCL queue interface +// as an input struct SyclDevice { // class member. QueueInterface* m_queue_stream; /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} - /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - /// the function then adds an entry by creating a sycl buffer for that particular pointer. + // get sycl accessor template EIGEN_STRONG_INLINE cl::sycl::accessor get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (get_sycl_buffer(ptr).template get_access(cgh)); + return m_queue_stream->template get_sycl_accessor(cgh, ptr); } /// Accessing the created sycl device buffer for the device pointer EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->find_buffer(ptr)->second; + return m_queue_stream->get_sycl_buffer(ptr); } /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels @@ -353,8 +422,6 @@ struct SyclDevice { m_queue_stream->parallel_for_setup(dim0, dim1, tileSize0, tileSize1, rng0, rng1, GRange0, GRange1); } - - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { @@ -375,72 +442,27 @@ struct SyclDevice { /// the memcpy function template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer(static_cast(src)); - auto it2 = m_queue_stream->find_buffer(dst); - auto offset= (static_cast(static_cast(src))) - it1->first; - auto i= (static_cast(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access(cgh); - auto dst_acc =it2->second.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + m_queue_stream->memcpy(dst,src,n); } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - auto it = m_queue_stream->find_buffer(ptr); - return (static_cast(ptr))-it->first; + return m_queue_stream->get_offset(ptr); } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode - /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the - /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. +// memcpyHostToDevice template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { m_queue_stream->memcpyHostToDevice(dst,src,n); } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. 
The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. +/// here is the memcpyDeviceToHost template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { m_queue_stream->memcpyDeviceToHost(dst,src,n); } - /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} /// Here is the implementation of memset function on sycl. EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - auto it1 = m_queue_stream->find_buffer(static_cast(data)); - ptrdiff_t buff_offset= (static_cast(data)) - it1->first; - sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c )); - synchronize(); + m_queue_stream->memset(data,c,n); } - - struct memsetCghFunctor{ - cl::sycl::buffer& m_buf; - const ptrdiff_t& buff_offset; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access(cgh); - typedef decltype(buf_acc) AccType; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c)); - } - }; + /// returning the sycl queue + EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->sycl_queue();} EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME @@ -449,37 +471,31 @@ struct SyclDevice { EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. + // there is no l3 cache on sycl devices. return firstLevelCacheSize(); } EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return sycl_queue().get_device(). template get_info(); - // return stream_->deviceProperties().multiProcessorCount; + return m_queue_stream->getNumSyclMultiProcessors(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return sycl_queue().get_device(). template get_info(); - - // return stream_->deviceProperties().maxThreadsPerBlock; + return m_queue_stream->maxSyclThreadsPerBlock(); } EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { // OpenCL doesnot have such concept - return 2;//sycl_queue().get_device(). template get_info(); + return m_queue_stream->maxSyclThreadsPerMultiProcessor(); // return stream_->deviceProperties().maxThreadsPerMultiProcessor; } EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return sycl_queue().get_device(). 
template get_info(); - // return stream_->deviceProperties().sharedMemPerBlock; + return m_queue_stream->sharedMemPerBlock(); } /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return m_queue_stream->majorDeviceVersion(); } EIGEN_STRONG_INLINE void synchronize() const { m_queue_stream->synchronize(); //pass } EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled m_queue_stream->asynchronousExec(); } // This function checks if the runtime recorded an error for the From 5e9a1e7a7a7eccbb20a2c4eb44141727b0943f11 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 8 Mar 2017 14:17:48 +0000 Subject: [PATCH 063/680] Adding sycl Benchmarks. --- bench/tensors/README | 10 +- bench/tensors/tensor_benchmarks.h | 106 +++++++++++++++--- bench/tensors/tensor_benchmarks_sycl.cc | 73 ++++++++++-- .../tensor_benchmarks_sycl_include_headers.cc | 2 + .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 33 +++--- 5 files changed, 184 insertions(+), 40 deletions(-) create mode 100644 bench/tensors/tensor_benchmarks_sycl_include_headers.cc diff --git a/bench/tensors/README b/bench/tensors/README index 3a5fdbe17..c4b742749 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -14,8 +14,12 @@ nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -D last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu -To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code): +To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code): 1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. -{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc +{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1 2. The host compilation pass that generates the final host binary. 
-clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl +clang++ -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o +clang++ tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl +export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib +3. Run the benchmark +./tensor_benchmark_sycl diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index c2fb3dede..325026113 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -35,6 +35,11 @@ template class BenchmarkSuite { void memcpy(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); @@ -55,7 +60,11 @@ template class BenchmarkSuite { } const TensorMap, Eigen::Aligned> A((int*)a_, sizes); TensorMap, Eigen::Aligned> B(b_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.template cast(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.template cast(); @@ -70,7 +79,6 @@ template class BenchmarkSuite { sizes[0] = m_; sizes[1] = m_; TensorMap, Eigen::Aligned> C(c_, sizes); - StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = C.random(); @@ -93,7 +101,18 @@ template class BenchmarkSuite { const Eigen::DSizes second_quadrant(0, m_/2); const Eigen::DSizes third_quadrant(m_/2, 0); const Eigen::DSizes fourth_quadrant(m_/2, m_/2); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.slice(first_quadrant, quarter_sizes).device(device_) = @@ -118,7 +137,11 @@ template class BenchmarkSuite { Eigen::array output_size; output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.chip(iter % k_, 0); @@ -135,7 +158,11 @@ template class BenchmarkSuite { Eigen::array output_size; output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.chip(iter % n_, 1); @@ -158,7 
+185,11 @@ template class BenchmarkSuite { Eigen::array shuffle; shuffle[0] = 1; shuffle[1] = 0; - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.shuffle(shuffle); @@ -186,7 +217,11 @@ template class BenchmarkSuite { paddings[0] = Eigen::IndexPair(0, 0); paddings[1] = Eigen::IndexPair(2, 1); #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.pad(paddings); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.pad(paddings); @@ -216,6 +251,11 @@ template class BenchmarkSuite { Eigen::IndexList, Eigen::type2index<2> > strides; #endif +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.stride(strides); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.stride(strides); @@ -245,6 +285,11 @@ template class BenchmarkSuite { broadcast.set(1, n_); #endif +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.broadcast(broadcast); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.broadcast(broadcast); @@ -261,7 +306,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A * A.constant(static_cast(3.14)) + B * B.constant(static_cast(2.7)); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A * A.constant(static_cast(3.14)) + B * B.constant(static_cast(2.7)); @@ -280,6 +329,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); +#ifdef EIGEN_USE_SYCL // warmup for sycl +for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); +} +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); @@ -297,7 +351,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.exp() + B.log(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.exp() + B.log(); @@ -325,7 +383,11 @@ template class BenchmarkSuite { // optimize the code. Eigen::IndexList> sum_along_dim; #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(sum_along_dim); @@ -355,7 +417,11 @@ template class BenchmarkSuite { // optimize the code. 
Eigen::IndexList> sum_along_dim; #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(sum_along_dim); @@ -375,7 +441,11 @@ template class BenchmarkSuite { Eigen::array output_size; TensorMap, Eigen::Aligned> C( c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(); @@ -404,7 +474,11 @@ template class BenchmarkSuite { typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(1, 0); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.contract(B, dims); @@ -430,7 +504,11 @@ template class BenchmarkSuite { Eigen::array dims; dims[0] = 0; dims[1] = 1; - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.convolve(B, dims); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.convolve(B, dims); diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc index 6df190869..cb6daac15 100644 --- a/bench/tensors/tensor_benchmarks_sycl.cc +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -1,20 +1,73 @@ -#define EIGEN_USE_SYCL +#ifdef EIGEN_USE_SYCL #include #include #include "tensor_benchmarks.h" -#define BM_FuncGPU(FUNC) \ - static void BM_##FUNC(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters); \ - } \ +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); +BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); +BM_FuncGPU(slicing); +BM_FuncGPU(rowChip); +BM_FuncGPU(colChip); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); + + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + 
Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); +#endif diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc new file mode 100644 index 000000000..4b3110b85 --- /dev/null +++ b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc @@ -0,0 +1,2 @@ +#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.cc" +#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.sycl" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 23297a0a7..5fa7423d0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -149,16 +149,27 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { std::lock_guard lock(mutex_); buffer_map.clear(); } - //FIXME: currently we have to switch back to write as discard_write doesnot work in forloop /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode - /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the - /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that + /// pointer created as a key we find the sycl buffer and get the host accessor with write mode + /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that /// this buffer is accessed, the data will be copied to the device. + /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark + /// Also, this is faster as it uses the map_allocator instead of memcpy template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); - auto host_acc= find_buffer(dst)->second. template get_access(); - ::memcpy(host_acc.get_pointer(), src, n); + auto it =find_buffer(dst); + auto offset =static_cast(static_cast(dst))- it->first; + offset/=sizeof(Index); + size_t rng, GRange, tileSize; + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); + auto src_buf = cl::sycl::buffer >(static_cast(static_cast(const_cast(src))), cl::sycl::range<1>(n)); + m_queue.submit([&](cl::sycl::handler &cgh) { + auto dst_acc= it->second.template get_access(cgh); + auto src_acc =src_buf.template get_access(cgh); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, offset, 0)); + }); + synchronize(); } /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. 
The lifespan of the buffer is bound to the @@ -167,7 +178,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); auto it =find_buffer(src); auto offset =static_cast(static_cast(src))- it->first; offset/=sizeof(Index); @@ -186,7 +196,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// the memcpy function template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - std::lock_guard lock(mutex_); auto it1 = find_buffer(static_cast(src)); auto it2 = find_buffer(dst); auto offset= (static_cast(static_cast(src))) - it1->first; @@ -206,7 +215,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { } EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - std::lock_guard lock(mutex_); size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); auto it1 = find_buffer(static_cast(data)); @@ -220,18 +228,15 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// the function then adds an entry by creating a sycl buffer for that particular pointer. template EIGEN_STRONG_INLINE cl::sycl::accessor get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - std::lock_guard lock(mutex_); return (find_buffer(ptr)->second.template get_access(cgh)); } /// Accessing the created sycl device buffer for the device pointer EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { - std::lock_guard lock(mutex_); return find_buffer(ptr)->second; } EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - std::lock_guard lock(mutex_); return (static_cast(ptr))-(find_buffer(ptr)->first); } @@ -375,7 +380,9 @@ private: mutable std::map> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; + EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { + std::lock_guard lock(mutex_); auto it1 = buffer_map.find(static_cast(ptr)); if (it1 != buffer_map.end()){ return it1; From 970ff78294503896940fb760d948f1eed156250f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 8 Mar 2017 16:16:53 +0100 Subject: [PATCH 064/680] bug #1401: fix compilation of "cond ? x : -x" with x an AutoDiffScalar --- unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 50fedf6ac..d2808860c 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -108,7 +108,9 @@ class AutoDiffScalar template AutoDiffScalar(const AutoDiffScalar& other #ifndef EIGEN_PARSED_BY_DOXYGEN - , typename internal::enable_if::type>::Scalar>::value,void*>::type = 0 + , typename internal::enable_if< + internal::is_same::type>::Scalar>::value + && internal::is_convertible::value , void*>::type = 0 #endif ) : m_value(other.value()), m_derivatives(other.derivatives()) From aadb7405a7362ce0160d8ecb3843dc33a59e809a Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 8 Mar 2017 18:20:06 +0000 Subject: [PATCH 065/680] Fixing typo in sycl Benchmark. 
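
(Editorial background note on the AutoDiffScalar change a few hunks above, bug #1401: the converting constructor is gated with enable_if so that overload resolution simply skips it when the other scalar's derivative type is not convertible, which is what lets "cond ? x : -x" compile. A reduced, self-contained sketch of that gating pattern, using std::enable_if/std::is_convertible in place of Eigen's internal traits and a hypothetical ADScalar type:)

    #include <type_traits>

    template <typename DerType>
    struct ADScalar {
      ADScalar() {}
      // The defaulted void* parameter only has a valid type when
      // OtherDerType converts to DerType; otherwise SFINAE removes this
      // overload instead of producing a hard error.
      template <typename OtherDerType>
      ADScalar(const ADScalar<OtherDerType>&,
               typename std::enable_if<
                   std::is_convertible<OtherDerType, DerType>::value,
                   void*>::type = nullptr) {}
    };

(With this gate, a non-convertible OtherDerType, such as the negated expression type in "cond ? x : -x", falls through to other conversion paths rather than instantiating a constructor that cannot compile.)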
--- bench/tensors/README | 4 ++-- bench/tensors/tensor_benchmarks_sycl_include_headers.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bench/tensors/README b/bench/tensors/README index c4b742749..69342cc9c 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -18,8 +18,8 @@ To compile and run the benchmark for SYCL, using ComputeCpp you currently need f 1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. {ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1 2. The host compilation pass that generates the final host binary. -clang++ -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o -clang++ tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl +clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o +clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib 3. Run the benchmark ./tensor_benchmark_sycl diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc index 4b3110b85..bcc3c4c79 100644 --- a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc +++ b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc @@ -1,2 +1,2 @@ -#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.cc" -#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.sycl" +#include "tensor_benchmarks_sycl.cc" +#include "tensor_benchmarks_sycl.sycl" From 1b32a10053a942b1c6010afd719b44393b115d42 Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Wed, 8 Mar 2017 18:26:34 +0000 Subject: [PATCH 066/680] Use name to distinguish name instead of the vendor --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 5fa7423d0..ed21d7b56 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -76,7 +76,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device std::vector::iterator it =devices.begin(); while(it!=devices.end()) { ///FIXME: Currently there is a bug in amd cpu OpenCL - auto s= (*it).template get_info(); + auto s= (*it).template get_info(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs it=devices.erase(it); From 344c2694a64494721e6d36d1197bde47c7d12af9 Mon Sep 17 
00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 9 Mar 2017 15:41:03 -0800 Subject: [PATCH 067/680] Make the non-blocking threadpool more flexible and less wasteful of CPU cycles for high-latency use-cases. * Adds a hint to ThreadPool allowing us to turn off spin waiting. Currently each reader and record yielder op in a graph creates a threadpool with a thread that spins for 1000 iterations through the work stealing loop before yielding. This is wasteful for such ops that process I/O. * This also changes the number of iterations through the steal loop to be inversely proportional to the number of threads. Since the time of each iteration is proportional to the number of threads, this yields roughly a constant spin time. * Implement a separate worker loop for the num_threads == 1 case since there is no point in going through the expensive steal loop. Moreover, since Steal() calls PopBack() on the victim queues it might reverse the order in which ops are executed, compared to the order in which they are scheduled, which is usually counter-productive for the types of I/O workloads the single thread pools tend to be used for. * Store num_threads in a member variable for simplicity and to avoid a data race between the thread creation loop and worker threads calling threads_.size(). --- .../src/ThreadPool/NonBlockingThreadPool.h | 163 ++++++++++++------ .../test/cxx11_non_blocking_thread_pool.cpp | 7 +- 2 files changed, 111 insertions(+), 59 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index ed1a761b6..e28afedb4 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -20,7 +20,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { typedef RunQueue Queue; NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : env_(env), + : num_threads_(num_threads), + allow_spinning_(true), + env_(env), threads_(num_threads), queues_(num_threads), coprimes_(num_threads), @@ -30,34 +32,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { done_(false), cancelled_(false), ec_(waiters_) { - waiters_.resize(num_threads); + Init(); + } - // Calculate coprimes of num_threads. - // Coprimes are used for a random walk over all threads in Steal - // and NonEmptyQueueIndex. Iteration is based on the fact that if we take - // a walk starting thread index t and calculate num_threads - 1 subsequent - // indices as (t + coprime) % num_threads, we will cover all threads without - // repetitions (effectively getting a presudo-random permutation of thread - // indices). - for (int i = 1; i <= num_threads; i++) { - unsigned a = i; - unsigned b = num_threads; - // If GCD(a, b) == 1, then a and b are coprimes. 
- while (b != 0) { - unsigned tmp = a; - a = b; - b = tmp % b; - } - if (a == 1) { - coprimes_.push_back(i); - } - } - for (int i = 0; i < num_threads; i++) { - queues_.push_back(new Queue()); - } - for (int i = 0; i < num_threads; i++) { - threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); - } + NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, + Environment env = Environment()) + : num_threads_(num_threads), + allow_spinning_(allow_spinning), + env_(env), + threads_(num_threads), + queues_(num_threads), + coprimes_(num_threads), + waiters_(num_threads), + blocked_(0), + spinning_(0), + done_(false), + cancelled_(false), + ec_(waiters_) { + Init(); } ~NonBlockingThreadPoolTempl() { @@ -77,8 +69,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } // Join threads explicitly to avoid destruction order issues. - for (size_t i = 0; i < threads_.size(); i++) delete threads_[i]; - for (size_t i = 0; i < threads_.size(); i++) delete queues_[i]; + for (size_t i = 0; i < num_threads_; i++) delete threads_[i]; + for (size_t i = 0; i < num_threads_; i++) delete queues_[i]; } void Schedule(std::function fn) { @@ -125,7 +117,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } int NumThreads() const final { - return static_cast(threads_.size()); + return num_threads_; } int CurrentThreadId() const final { @@ -149,6 +141,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { }; Environment env_; + const int num_threads_; + const bool allow_spinning_; MaxSizeVector threads_; MaxSizeVector queues_; MaxSizeVector coprimes_; @@ -159,6 +153,37 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { std::atomic cancelled_; EventCount ec_; + void Init() { + waiters_.resize(num_threads_); + + // Calculate coprimes of num_threads_. + // Coprimes are used for a random walk over all threads in Steal + // and NonEmptyQueueIndex. Iteration is based on the fact that if we take + // a walk starting thread index t and calculate num_threads - 1 subsequent + // indices as (t + coprime) % num_threads, we will cover all threads without + // repetitions (effectively getting a presudo-random permutation of thread + // indices). + for (int i = 1; i <= num_threads_; i++) { + unsigned a = i; + unsigned b = num_threads_; + // If GCD(a, b) == 1, then a and b are coprimes. + while (b != 0) { + unsigned tmp = a; + a = b; + b = tmp % b; + } + if (a == 1) { + coprimes_.push_back(i); + } + } + for (int i = 0; i < num_threads_; i++) { + queues_.push_back(new Queue()); + } + for (int i = 0; i < num_threads_; i++) { + threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); + } + } + // Main worker thread loop. void WorkerLoop(int thread_id) { PerThread* pt = GetPerThread(); @@ -167,36 +192,62 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { pt->thread_id = thread_id; Queue* q = queues_[thread_id]; EventCount::Waiter* waiter = &waiters_[thread_id]; - while (!cancelled_) { - Task t = q->PopFront(); - if (!t.f) { - t = Steal(); + // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional + // to num_threads_ and we assume that new work is scheduled at a + // constant rate, so we set spin_count to 5000 / num_threads_. The + // constant was picked based on a fair dice roll, tune it. + const int spin_count = + allow_spinning_ && num_threads_ > 0 ? 
5000 / num_threads_ : 0; + if (num_threads_ == 1) { + // For num_threads_ == 1 there is no point in going through the expensive + // steal loop. Moreover, since Steal() calls PopBack() on the victim + // queues it might reverse the order in which ops are executed compared to + // the order in which they are scheduled, which tends to be + // counter-productive for the types of I/O workloads the single thread + // pools tend to be used for. + while (!cancelled_) { + Task t = q->PopFront(); + for (int i = 0; i < spin_count && !t.f; i++) { + if (!cancelled_.load(std::memory_order_relaxed)) { + t = q->PopFront(); + } + } if (!t.f) { - // Leave one thread spinning. This reduces latency. - // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it. - // Also, the time it takes to attempt to steal work 1000 times depends - // on the size of the thread pool. However the speed at which the user - // of the thread pool submit tasks is independent of the size of the - // pool. Consider a time based limit instead. - if (!spinning_ && !spinning_.exchange(true)) { - for (int i = 0; i < 1000 && !t.f; i++) { - if (!cancelled_.load(std::memory_order_relaxed)) { - t = Steal(); - } else { + if (!WaitForWork(waiter, &t)) { + return; + } + } + if (t.f) { + env_.ExecuteTask(t); + } + } + } else { + while (!cancelled_) { + Task t = q->PopFront(); + if (!t.f) { + t = Steal(); + if (!t.f) { + // Leave one thread spinning. This reduces latency. + if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) { + for (int i = 0; i < spin_count && !t.f; i++) { + if (!cancelled_.load(std::memory_order_relaxed)) { + t = Steal(); + } else { + return; + } + } + spinning_ = false; + } + if (!t.f) { + if (!WaitForWork(waiter, &t)) { return; } } - spinning_ = false; - } - if (!t.f) { - if (!WaitForWork(waiter, &t)) { - return; - } } } - } - if (t.f) { - env_.ExecuteTask(t); + if (t.f) { + env_.ExecuteTask(t); + } } } } @@ -244,7 +295,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { // If we are shutting down and all worker threads blocked without work, // that's we are done. blocked_++; - if (done_ && blocked_ == threads_.size()) { + if (done_ && blocked_ == num_threads_) { ec_.CancelWait(waiter); // Almost done, but need to re-check queues. // Consider that all queues are empty and all worker threads are preempted diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp index 2c5765ce4..48cd2d4e4 100644 --- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp +++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -23,11 +23,11 @@ static void test_create_destroy_empty_pool() } -static void test_parallelism() +static void test_parallelism(bool allow_spinning) { // Test we never-ever fail to match available tasks with idle threads. 
const int kThreads = 16; // code below expects that this is a multiple of 4 - NonBlockingThreadPool tp(kThreads); + NonBlockingThreadPool tp(kThreads, allow_spinning); VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); for (int iter = 0; iter < 100; ++iter) { @@ -119,6 +119,7 @@ static void test_cancel() void test_cxx11_non_blocking_thread_pool() { CALL_SUBTEST(test_create_destroy_empty_pool()); - CALL_SUBTEST(test_parallelism()); + CALL_SUBTEST(test_parallelism(true)); + CALL_SUBTEST(test_parallelism(false)); CALL_SUBTEST(test_cancel()); } From d56ab01094d30cf57150b454b8ebaf20ac7bc85e Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 10 Mar 2017 08:30:22 -0800 Subject: [PATCH 068/680] Use C++11 ctor forwarding to simplify code a bit. --- .../src/ThreadPool/NonBlockingThreadPool.h | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index e28afedb4..4fbd3af1e 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -19,22 +19,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { typedef typename Environment::Task Task; typedef RunQueue Queue; - NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : num_threads_(num_threads), - allow_spinning_(true), - env_(env), - threads_(num_threads), - queues_(num_threads), - coprimes_(num_threads), - waiters_(num_threads), - blocked_(0), - spinning_(0), - done_(false), - cancelled_(false), - ec_(waiters_) { - Init(); - } - NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) : num_threads_(num_threads), @@ -52,6 +36,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { Init(); } + NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) + : NonBlockingThreadPoolTempl(num_threads, true, env) {} + ~NonBlockingThreadPoolTempl() { done_ = true; From bfd7bf9c5b4ebc932a339200bb4a455eadcf6d28 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 10 Mar 2017 08:48:20 -0800 Subject: [PATCH 069/680] Get rid of Init(). --- .../src/ThreadPool/NonBlockingThreadPool.h | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 4fbd3af1e..9dcc9dab7 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -19,6 +19,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { typedef typename Environment::Task Task; typedef RunQueue Queue; + NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) + : NonBlockingThreadPoolTempl(num_threads, true, env) {} + NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) : num_threads_(num_threads), @@ -33,11 +36,35 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { done_(false), cancelled_(false), ec_(waiters_) { - Init(); - } + waiters_.resize(num_threads_); - NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : NonBlockingThreadPoolTempl(num_threads, true, env) {} + // Calculate coprimes of num_threads_. 
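+    // (For example, num_threads_ = 4 yields the coprimes {1, 3}: the values i in [1, num_threads_] with GCD(i, num_threads_) == 1; stepping by 3 from thread 2 visits 2, 1, 0, 3 -- every thread exactly once.)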
+ // Coprimes are used for a random walk over all threads in Steal + // and NonEmptyQueueIndex. Iteration is based on the fact that if we take + // a walk starting thread index t and calculate num_threads - 1 subsequent + // indices as (t + coprime) % num_threads, we will cover all threads without + // repetitions (effectively getting a pseudo-random permutation of thread + // indices). + for (int i = 1; i <= num_threads_; i++) { + unsigned a = i; + unsigned b = num_threads_; + // If GCD(a, b) == 1, then a and b are coprimes. + while (b != 0) { + unsigned tmp = a; + a = b; + b = tmp % b; + } + if (a == 1) { + coprimes_.push_back(i); + } + } + for (int i = 0; i < num_threads_; i++) { + queues_.push_back(new Queue()); + } + for (int i = 0; i < num_threads_; i++) { + threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); + } + } ~NonBlockingThreadPoolTempl() { done_ = true; @@ -140,37 +167,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { std::atomic cancelled_; EventCount ec_; - void Init() { - waiters_.resize(num_threads_); - - // Calculate coprimes of num_threads_. - // Coprimes are used for a random walk over all threads in Steal - // and NonEmptyQueueIndex. Iteration is based on the fact that if we take - // a walk starting thread index t and calculate num_threads - 1 subsequent - // indices as (t + coprime) % num_threads, we will cover all threads without - // repetitions (effectively getting a pseudo-random permutation of thread - // indices). - for (int i = 1; i <= num_threads_; i++) { - unsigned a = i; - unsigned b = num_threads_; - // If GCD(a, b) == 1, then a and b are coprimes. - while (b != 0) { - unsigned tmp = a; - a = b; - b = tmp % b; - } - if (a == 1) { - coprimes_.push_back(i); - } - } - for (int i = 0; i < num_threads_; i++) { - queues_.push_back(new Queue()); - } - for (int i = 0; i < num_threads_; i++) { - threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); - } - } - // Main worker thread loop. void WorkerLoop(int thread_id) { PerThread* pt = GetPerThread(); From f499fe9496e7c5e6f70d4bdcfb6ed9088795431a Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 13 Mar 2017 09:18:37 +0000 Subject: [PATCH 070/680] Adding synchronisation to convolution kernel for sycl backend. 
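SYCL kernels are enqueued asynchronously, so without an explicit wait the benchmark
can stop its timer before the convolution has actually finished on the device. A
minimal sketch of the timing pattern this change establishes (names follow the
Eigen SYCL backend used elsewhere in this series):

    #elif defined(EIGEN_USE_SYCL)
      if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) {
        device_.synchronize();  // drain the queue before reading the clock
      }
    #endif
      StopBenchmarkTiming();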
--- bench/tensors/tensor_benchmarks.h | 5 +++++ unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + 2 files changed, 6 insertions(+) diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 325026113..3a640ede4 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -539,6 +539,11 @@ for (int iter = 0; iter < 10; ++iter) { if (Eigen::internal::is_same::value) { device_.synchronize(); } +#elif defined(EIGEN_USE_SYCL) + if (Eigen::internal::is_same::value) { + device_.synchronize(); + } + #endif StopBenchmarkTiming(); SetBenchmarkFlopsProcessed(num_items); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 5db16d559..2e6021b1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -425,6 +425,7 @@ struct TensorEvaluator Date: Tue, 14 Mar 2017 14:16:53 -0700 Subject: [PATCH 071/680] Made the reduction code compile with cuda-clang --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c841786b8..e341e2e9b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -11,6 +11,17 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, +// so we'll use a macro to make clang happy. +#ifndef KERNEL_FRIEND +#if defined(__clang__) && defined(__CUDA__) +#define KERNEL_FRIEND friend __global__ +#else +#define KERNEL_FRIEND friend +#endif +#endif + + namespace Eigen { @@ -681,15 +692,15 @@ struct TensorEvaluator, template friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 - template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); + template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); #endif - template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif #if defined(EIGEN_USE_SYCL) From 61160a21d23880749bce3b0d630d9880f70af6e5 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 15 Mar 2017 06:57:25 -0400 Subject: [PATCH 072/680] ARM prefetch fixes: Implement 
prefetch on ARM64. Do not clobber cc on ARM32. --- Eigen/src/Core/arch/NEON/PacketMath.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 84a56bdcc..aede4a6d5 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -51,14 +51,17 @@ typedef uint32x4_t Packet4ui; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function -// which available on LLVM and GCC (at least) -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC +#if EIGEN_ARCH_ARM64 + // __builtin_prefetch tends to do nothing on ARM64 compilers because the + // prefetch instructions there are too detailed for __builtin_prefetch to map + // meaningfully to them. + #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : ); +#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif !EIGEN_ARCH_ARM64 - #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#elif EIGEN_ARCH_ARM32 + #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); #else // by default no explicit prefetching #define EIGEN_ARM_PREFETCH(ADDR) From 89fd0c38812b024734eeacf9c23ed3714c8b0f93 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 15 Mar 2017 15:18:03 +0100 Subject: [PATCH 073/680] better check array index before using it --- Eigen/src/OrderingMethods/Eigen_Colamd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 933cd564b..da85b4d6e 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -1004,7 +1004,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; /* get pivot column from head of minimum degree list */ - while (head [min_score] == COLAMD_EMPTY && min_score < n_col) + while (min_score < n_col && head [min_score] == COLAMD_EMPTY) { min_score++ ; } From c06861d15e16a8f6241d4e409ef0136f3277350f Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Wed, 15 Mar 2017 19:26:08 +0000 Subject: [PATCH 074/680] Fixes bug in get_sycl_supported_devices() that was reporting unsupported Intel CPU on AMD platform - causing timeouts in that configuration --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index ed21d7b56..e9c3dc0a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -76,13 +76,16 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device std::vector::iterator it =devices.begin(); while(it!=devices.end()) { ///FIXME: Currently there is a bug in amd cpu OpenCL - auto s= (*it).template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu 
as it is not supported by computecpp allow APUs - it=devices.erase(it); + auto name = (*it).template get_info(); + std::transform(name.begin(), name.end(), name.begin(), ::tolower); + auto vendor = (*it).template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + + if((*it).is_cpu() && vendor.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs + it = devices.erase(it); //FIXME: currently there is a bug in intel gpu driver regarding memory allignment issue. - }else if((*it).is_gpu() && s.find("intel")!=std::string::npos){ - it=devices.erase(it); + }else if((*it).is_gpu() && name.find("intel")!=std::string::npos){ + it = devices.erase(it); } else{ ++it; From 9597d6f6aba6091f986fbe2348106dc96a24e34e Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Wed, 15 Mar 2017 19:28:09 +0000 Subject: [PATCH 075/680] Temporary: Disables cxx11_tensor_argmax_sycl test since it is causing zombie thread --- unsupported/test/cxx11_tensor_argmax_sycl.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp index 9b22f1eca..b80cb8f06 100644 --- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -242,7 +242,9 @@ template void sycl_argmax_test_per_ } void test_cxx11_tensor_argmax_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { +// TODO {lukei}: re-enable once fixed +/* for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_argmax_test_per_device(device)); } +*/ } From fd7db52f9b3b1227719c6d8281ff5e5198aaae82 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Mar 2017 20:02:39 -0700 Subject: [PATCH 076/680] Silenced compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index f335edf7d..c46a778b5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -50,6 +50,7 @@ template struct DimensionId { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + EIGEN_UNUSED_VARIABLE(dim); eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { From aae19c70ac273a2d40daf18a3cd15c0b0075662b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 17 Mar 2017 17:33:15 +0100 Subject: [PATCH 077/680] update has_ReturnType to be more consistent with other has_ helpers --- Eigen/src/Core/util/Meta.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 90eda6e70..8de605500 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -433,10 +433,10 @@ struct meta_no { char a[2]; }; template struct has_ReturnType { - template static meta_yes testFunctor(typename C::ReturnType const *); - template static meta_no testFunctor(...); + template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); + template static meta_no testFunctor(...); - enum { value = sizeof(testFunctor(0)) == sizeof(meta_yes) }; + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; }; template const T* return_ptr(); From a91417a7a5a210f424b8cfec4b2bc1e00aa340be Mon Sep 17 00:00:00 
2001 From: Luke Iwanski Date: Mon, 20 Mar 2017 14:48:54 +0000 Subject: [PATCH 078/680] Introduces align allocator for SYCL buffer --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e9c3dc0a0..c5142b7c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -15,6 +15,17 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +template > +struct SyclAllocator { + typedef Scalar value_type; + typedef typename std::allocator_traits::pointer pointer; + typedef typename std::allocator_traits::size_type size_type; + + SyclAllocator( ){}; + Scalar* allocate(std::size_t elements) { return static_cast(aligned_alloc(Align, elements)); } + void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } +}; + namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) @@ -56,11 +67,11 @@ template }; struct memsetCghFunctor{ - cl::sycl::buffer& m_buf; + cl::sycl::buffer >& m_buf; const ptrdiff_t& buff_offset; const size_t& rng , GRange, tileSize; const int &c; - memsetCghFunctor(cl::sycl::buffer& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) + memsetCghFunctor(cl::sycl::buffer >& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) :m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} void operator()(cl::sycl::handler &cgh) const { @@ -124,6 +135,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { })) #endif {} + /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we @@ -131,10 +143,10 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { std::lock_guard lock(mutex_); - auto buf = cl::sycl::buffer(cl::sycl::range<1>(num_bytes)); + auto buf = cl::sycl::buffer >(cl::sycl::range<1>(num_bytes)); auto ptr =buf.get_access().get_pointer(); buf.set_final_data(nullptr); - buffer_map.insert(std::pair>(static_cast(ptr),buf)); + buffer_map.insert(std::pair > >(static_cast(ptr),buf)); return static_cast(ptr); } @@ -235,7 +247,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { + EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { return find_buffer(ptr)->second; } @@ -380,18 +392,18 @@ private: /// std::map is the container used to make sure that we create only one buffer /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. 
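/// (Illustration: find_buffer() below also resolves interior pointers -- any p with base < p < base + size maps back to the buffer registered at base -- so offset host-side keys remain valid.)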
/// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map> buffer_map; + mutable std::map > > buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; - EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { + EIGEN_STRONG_INLINE std::map > >::iterator find_buffer(const void* ptr) const { std::lock_guard lock(mutex_); auto it1 = buffer_map.find(static_cast(ptr)); if (it1 != buffer_map.end()){ return it1; } else{ - for(std::map>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ + for(std::map > >::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ auto size = it->second.get_size(); if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; } @@ -416,7 +428,7 @@ struct SyclDevice { } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { + EIGEN_STRONG_INLINE cl::sycl::buffer >& get_sycl_buffer(const void * ptr) const { return m_queue_stream->get_sycl_buffer(ptr); } From 511810797e35471568091c44e947329f0060698e Mon Sep 17 00:00:00 2001 From: Simon Praetorius Date: Fri, 24 Mar 2017 17:45:56 +0100 Subject: [PATCH 079/680] Issue with mpreal and std::numeric_limits, i.e. digits is not a constant. Added a digits() traits in NumTraits with fallback to static constant. Specialization for mpreal added in MPRealSupport. --- Eigen/src/Core/NumTraits.h | 34 +++++++++++++++++++++++++++++++++ Eigen/src/Core/StableNorm.h | 2 +- unsupported/Eigen/MPRealSupport | 3 +++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index aebc0c259..4d896a098 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -41,6 +41,34 @@ struct default_digits10_impl // Integer static int run() { return 0; } }; + +// default implementation of digits(), based on numeric_limits if specialized, +// 0 for integer types, and log2(epsilon()) otherwise. +template< typename T, + bool use_numeric_limits = std::numeric_limits::is_specialized, + bool is_integer = NumTraits::IsInteger> +struct default_digits_impl +{ + static int run() { return std::numeric_limits::digits; } +}; + +template +struct default_digits_impl // Floating point +{ + static int run() { + using std::log; + using std::ceil; + typedef typename NumTraits::Real Real; + return int(ceil(-log(NumTraits::epsilon())/log(static_cast(2)))); + } +}; + +template +struct default_digits_impl // Integer +{ + static int run() { return 0; } +}; + } // end namespace internal /** \class NumTraits @@ -118,6 +146,12 @@ template struct GenericNumTraits return internal::default_digits10_impl::run(); } + EIGEN_DEVICE_FUNC + static inline int digits() + { + return internal::default_digits_impl::run(); + } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index be04ed44d..4ea598ba8 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -74,7 +74,7 @@ blueNorm_impl(const EigenBase& _vec) // are used. 
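// (Illustration: for double, NumTraits<double>::digits() and std::numeric_limits<double>::digits both evaluate to 53; the indirection introduced above matters for types such as mpfr::mpreal, whose precision is set at runtime.)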
For any specific computer, each of the assignment // statements can be replaced ibeta = std::numeric_limits::radix; // base for floating-point numbers - it = std::numeric_limits::digits; // number of base-beta digits in mantissa + it = NumTraits::digits(); // number of base-beta digits in mantissa iemin = std::numeric_limits::min_exponent; // minimum exponent iemax = std::numeric_limits::max_exponent; // maximum exponent rbig = (std::numeric_limits::max)(); // largest floating-point number diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport index 7f0b70c63..b770abc4a 100644 --- a/unsupported/Eigen/MPRealSupport +++ b/unsupported/Eigen/MPRealSupport @@ -90,6 +90,9 @@ int main() #ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec()) { return std::numeric_limits::digits10(Precision); } static inline int digits10 (const Real& x) { return std::numeric_limits::digits10(x); } + + static inline int digits () { return std::numeric_limits::digits(); } + static inline int digits (const Real& x) { return std::numeric_limits::digits(x); } #endif static inline Real dummy_precision() From bd64ee8555559ee13f02f2921594b4bd224f9d00 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 28 Mar 2017 16:50:34 +0100 Subject: [PATCH 080/680] Fixing TensorArgMaxSycl.h; Removing warning related to the hardcoded type of dims to be int in Argmax. --- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 10 +++++----- .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 20 +++++++++---------- .../Eigen/CXX11/src/Tensor/TensorBase.h | 8 ++++---- .../Eigen/CXX11/src/Tensor/TensorPatch.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- .../src/Tensor/TensorSyclExtractFunctors.h | 10 +++++----- unsupported/test/cxx11_tensor_argmax_sycl.cpp | 9 ++------- 7 files changed, 28 insertions(+), 33 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index e81001c6e..44b79e725 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -178,7 +178,7 @@ class TensorTupleReducerOp : public TensorBase, Devi EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } #else // following functions are required by sycl EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleType* data() const { return m_impl.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int return_dim() const {return m_return_dim;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;} @@ -303,7 +303,7 @@ struct TensorEvaluator, Devi protected: TensorEvaluator, Device> m_orig_impl; TensorEvaluator >, Device> m_impl; - const int m_return_dim; + const Index m_return_dim; StrideDims m_strides; Index m_stride_mod; Index m_stride_div; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 90cbe004f..8f76b8254 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -61,9 +61,9 @@ class TensorTupleReducerDeviceOp : public TensorBase::type m_xpr; - const int m_return_dim; - const StrideDims& m_strides; + const Index 
m_return_dim; + const StrideDims m_strides; const Index m_stride_mod; const Index m_stride_div; }; @@ -137,10 +137,10 @@ typedef typename MakeGlobalPointer m_impl; - const int m_return_dim; - const StrideDims& m_strides; - const Index& m_stride_mod; - const Index& m_stride_div; + const Index m_return_dim; + const StrideDims m_strides; + const Index m_stride_mod; + const Index m_stride_div; }; } // end namespace Eigen #endif //UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_ARGMAX_SYCL_HPP diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index fbe340820..5b1235826 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -619,7 +619,7 @@ class TensorBase const array, const Derived> argmax() const { array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, @@ -632,7 +632,7 @@ class TensorBase const array, const Derived> argmin() const { array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, @@ -643,7 +643,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, const Derived> - argmax(const int return_dim) const { + argmax(const Index return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -656,7 +656,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, const Derived> - argmin(const int return_dim) const { + argmin(const Index return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index cead2eac8..1a105165d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -268,7 +268,7 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; // required by sycl - const PatchDim& patch_dims; + const PatchDim patch_dims; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index e341e2e9b..597f3f9ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -791,7 +791,7 @@ static const bool RunningOnGPU = false; typename MakePointer_::Type m_result; const Device& m_device; - const Dims& m_xpr_dims; + const Dims m_xpr_dims; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 942e9d307..a7905706d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -230,15 +230,15 @@ template\ typedef typename Evaluator::Index Index;\ typedef typename Eigen::internal::conditional, typename Evaluator::Dimensions >::type Dimensions;\ const Dimensions m_dimensions;\ - const int m_return_dim;\ + const Index m_return_dim;\ const StrideDims m_strides;\ const Index m_stride_mod;\ const Index m_stride_div;\ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ - EIGEN_STRONG_INLINE int 
return_dim() const {return m_return_dim;}\ - EIGEN_STRONG_INLINE const StrideDims& strides() const {return m_strides;}\ - EIGEN_STRONG_INLINE const Index& stride_mod() const {return m_stride_mod;}\ - EIGEN_STRONG_INLINE const Index& stride_div() const {return m_stride_div;}\ + EIGEN_STRONG_INLINE Index return_dim() const {return m_return_dim;}\ + EIGEN_STRONG_INLINE const StrideDims strides() const {return m_strides;}\ + EIGEN_STRONG_INLINE const Index stride_mod() const {return m_stride_mod;}\ + EIGEN_STRONG_INLINE const Index stride_div() const {return m_stride_div;}\ FunctorExtractor(const TensorEvaluator, Device>& expr)\ : m_dimensions(DimConstr::getDim(expr.dimensions())), m_return_dim(expr.return_dim()),\ m_strides(expr.strides()), m_stride_mod(expr.stride_mod()), m_stride_div(expr.stride_div()){}\ diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp index b80cb8f06..521a7f82c 100644 --- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -143,10 +143,6 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) } } - - - - template static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) { @@ -242,9 +238,8 @@ template void sycl_argmax_test_per_ } void test_cxx11_tensor_argmax_sycl() { -// TODO {lukei}: re-enable once fixed -/* for (const auto& device :Eigen::get_sycl_supported_devices()) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_argmax_test_per_device(device)); } -*/ + } From 73fcaa319fcd12328e0577042862db471488fd5c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Mar 2017 08:22:25 -0700 Subject: [PATCH 081/680] Gate the sycl specific code under #ifdef sycl --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 9 +++++++-- unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h | 10 ++++++---- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 8 ++++---- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 44b79e725..85facfb64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -228,7 +228,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()), m_device(device) { + m_return_dim(op.return_dim()) +#ifdef EIGEN_USE_SYCL + ,m_device(device) +#endif + { gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast(ColMajor)) { @@ -307,8 +311,9 @@ struct TensorEvaluator, Devi StrideDims m_strides; Index m_stride_mod; Index m_stride_div; - // required by sycl +#ifdef EIGEN_USE_SYCL const Device& m_device; +#endif }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index c72d79435..7e4c129bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -140,8 +140,9 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} +#ifdef 
EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } +#endif protected: EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { @@ -298,8 +299,9 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor +#ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Generator& functor() const { return m_generator; } - +#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -183,8 +182,9 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; array m_strides; Generator m_generator; - // required by sycl +#ifdef EIGEN_USE_SYCL TensorEvaluator m_argImpl; +#endif }; } // end namespace Eigen From e2d5d4e7b3ac51152e26de2d00eabb26ee2a4454 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Mar 2017 08:26:13 -0700 Subject: [PATCH 082/680] Restore the old constructors to retain compatibility with non c++11 compilers. --- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2fb6b84b9..0ac697087 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -70,8 +70,12 @@ class TensorImagePatchOp : public TensorBase Date: Fri, 31 Mar 2017 08:31:28 -0700 Subject: [PATCH 083/680] Restored code compatibility with compilers that dont support c++11 Gated more sycl code under #ifdef sycl --- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 9 ++++----- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 16 ++++++++++------ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index b6bf05fed..af6ecf5f4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -215,11 +215,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } - +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Strides& functor() const { return m_strides; } +#endif protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 1a105165d..415021510 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -255,10 +255,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor +#ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - /// required by sycl in order to extract the accessor EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE const PatchDim& functor() const { return patch_dims; } +#endif protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 64474ee80..33eb1b297 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -64,9 +64,13 @@ class TensorVolumePatchOp : public TensorBase Date: Fri, 31 Mar 2017 20:32:16 -0400 Subject: [PATCH 084/680] fix typos in the Tensor readme --- unsupported/Eigen/CXX11/src/Tensor/README.md | 18 +++++++++--------- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index 38cdb9c69..8d989f13a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -83,8 +83,8 @@ large enough to hold all the data. // You can also map fixed-size tensors. Here we get a 1d view of // the 2d fixed-size tensor. - Tensor> t_4x3; - TensorMap> t_12(t_4x3, 12); + TensorFixedSize> t_4x3; + TensorMap> t_12(t_4x3.data(), 12); #### Class TensorRef @@ -272,7 +272,7 @@ Operation to a TensorFixedSize instead of a Tensor, which is a bit more efficient. // We know that the result is a 4x4x2 tensor! - TensorFixedSize result = t5; + TensorFixedSize> result = t5; Similarly, assigning an expression to a TensorMap causes its evaluation. Like tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to @@ -296,7 +296,7 @@ the expression in a temporary Tensor of the right size. The code above in effect does: // .eval() knows the size! - TensorFixedSize tmp = t1 + t2; + TensorFixedSize> tmp = t1 + t2; Tensor result = (tmp * 0.2f).exp(); Note that the return value of ```eval()``` is itself an Operation, so the @@ -567,11 +567,11 @@ to the rank of the tensor. The content of the tensor is not initialized. ### TensorFixedSize -Creates a tensor of the specified size. The number of arguments in the Size<> +Creates a tensor of the specified size. The number of arguments in the Sizes<> template parameter determines the rank of the tensor. The content of the tensor is not initialized. - Eigen::TensorFixedSize> a; + Eigen::TensorFixedSize> a; cout << "Rank: " << a.rank() << endl; => Rank: 2 cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; @@ -584,11 +584,11 @@ until the TensorMap is discarded, and the size of the data must be large enough to accommodate all of the coefficients of the tensor. 
float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - Eigen::TensorMap a(data, 3, 4); + Eigen::TensorMap> a(data, 3, 4); cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; => NumRows: 3 NumCols: 4 cout << "a(1, 2): " << a(1, 2) << endl; - => a(1, 2): 9 + => a(1, 2): 7 ## Contents Initialization @@ -1314,7 +1314,7 @@ The previous example can be rewritten as follows: Eigen::Tensor a(2, 3); a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); Eigen::array two_dim({2, 3}); - Eigen::Tensor b; + Eigen::Tensor b(6); b.reshape(two_dim) = a; cout << "b" << endl << b << endl; => diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index fcee5f60d..e943757ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -20,7 +20,7 @@ namespace Eigen { * The fixed sized equivalent of * Eigen::Tensor t(3, 5, 7); * is - * Eigen::TensorFixedSize> t; + * Eigen::TensorFixedSize> t; */ template From bc050ea9f02f54c250858d5b6d24b226a3359021 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 09:47:04 -0700 Subject: [PATCH 085/680] Fixed compilation error when sycl is enabled. --- unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 220bd7ad0..11ae21be9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -97,9 +97,13 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()), m_argImpl(op.expression(), device) + : m_generator(op.generator()) +#ifdef EIGEN_USE_SYCL + , m_argImpl(op.expression(), device) +#endif { - m_dimensions = m_argImpl.dimensions(); + TensorEvaluator argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; From 63840d4666f5f92fd235ef30862db06706e928b4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 09:54:31 -0700 Subject: [PATCH 086/680] Gate the sycl specific code under a EIGEN_USE_SYCL define --- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 16 +++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 85facfb64..c0f33ba2d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -119,11 +119,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - // required by sycl +#ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - +#endif protected: TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 0ac697087..e1d5541bc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -70,7 +70,7 @@ class TensorImagePatchOp : public TensorBase, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const 
Device& device) - : m_impl(op.expression(), device), m_op(op) + : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_op(op) +#endif { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -423,9 +426,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - // required by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#endif Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -506,8 +510,10 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; TensorEvaluator m_impl; - // required for sycl + +#ifdef EIGEN_USE_SYCL const XprType& m_op; +#endif }; From e3e343390adf05e091b605bf4975c6e9f495e8a1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 09:56:33 -0700 Subject: [PATCH 087/680] Guard the sycl specific code with a #ifdef EIGEN_USE_SYCL --- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 415021510..6e8e7885b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -99,11 +99,14 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), patch_dims(op.patch_dims()) + : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_patch_dims(op.patch_dims()) +#endif { Index num_patches = 1; const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - // const PatchDim& patch_dims = op.patch_dims(); + const PatchDim& patch_dims = op.patch_dims(); if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims-1; ++i) { m_dimensions[i] = patch_dims[i]; @@ -257,7 +260,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return patch_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PatchDim& functor() const { return m_patch_dims; } #endif protected: @@ -267,8 +270,10 @@ struct TensorEvaluator, Device> array m_patchStrides; TensorEvaluator m_impl; - // required by sycl - const PatchDim patch_dims; + +#ifdef EIGEN_USE_SYCL + const PatchDim m_patch_dims; +#endif }; } // end namespace Eigen From 66c63826bdd550da08e86efd31c7cf0b793b1526 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 09:59:09 -0700 Subject: [PATCH 088/680] Guard the sycl specific code with EIGEN_USE_SYCL --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 597f3f9ae..7356334e1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -421,7 +421,10 @@ struct TensorEvaluator, static const bool RunningFullReduction = (NumOutputDims==0); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const 
XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) +#if defined(EIGEN_USE_SYCL) + , m_xpr_dims(op.dims()) +#endif { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -675,13 +678,12 @@ struct TensorEvaluator, } EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel - const Dims& xprDims() const {return m_xpr_dims;} +#if defined(EIGEN_USE_SYCL) + const TensorEvaluator& impl() const { return m_impl; } + const Device& device() const { return m_device; } + const Dims& xprDims() const { return m_xpr_dims; } +#endif private: template friend struct internal::GenericDimReducer; @@ -791,7 +793,10 @@ static const bool RunningOnGPU = false; typename MakePointer_::Type m_result; const Device& m_device; + +#if defined(EIGEN_USE_SYCL) const Dims m_xpr_dims; +#endif }; } // end namespace Eigen From a1304b95b7b3d667a7f04f1a75bd9427b2a59474 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 10:00:46 -0700 Subject: [PATCH 089/680] Code cleanup --- unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 33eb1b297..ea404242d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -325,7 +325,6 @@ struct TensorEvaluator, D m_outputPlanesRows = m_outputPlanes * m_outputRows; // Fast representations of different variables. 
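// (TensorIntDivisor stores a precomputed multiply-and-shift form of each stride, so the per-coefficient index / stride computations in coeff() below avoid hardware integer division.)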
- // printf("THis is m_otherStride: %lu\n", m_otherStride ); m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); From a5a0c8fac1d86b61501199c79d4cde36f9e22305 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 10:03:21 -0700 Subject: [PATCH 090/680] Guard sycl specific code under a EIGEN_USE_SYCL ifdef --- .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index ea404242d..f7b28cf66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -64,8 +64,8 @@ class TensorVolumePatchOp : public TensorBase, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) #endif - : m_impl(op.expression(), device), m_op(op) + : m_impl(op.expression(), device) +#ifdef EIGEN_USE_SYCL + , m_op(op) +#endif { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -510,8 +513,10 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } - // required by sycl + +#ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } +#endif Index planePaddingTop() const { return m_planePaddingTop; } Index rowPaddingTop() const { return m_rowPaddingTop; } @@ -607,8 +612,10 @@ struct TensorEvaluator, D Scalar m_paddingValue; TensorEvaluator m_impl; -// required by sycl + +#ifdef EIGEN_USE_SYCL XprType m_op; +#endif }; From c302ea7bc417ef479626266e15bff59a805e305f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 10:05:16 -0700 Subject: [PATCH 091/680] Deleted empty line of code --- unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index f7b28cf66..b0f970ae6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -359,7 +359,6 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - // Patch index corresponding to the passed in index. 
const Index patchIndex = index / m_fastPatchStride; From 068cc0970890b534d65dbc99e6b5795acbaaa801 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Apr 2017 10:09:10 -0700 Subject: [PATCH 092/680] Preserve file naming conventions --- unsupported/test/CMakeLists.txt | 4 ++-- ...age_patchOP_sycl.cpp => cxx11_tensor_image_patch_sycl.cpp} | 0 ...me_patchOP_sycl.cpp => cxx11_tensor_volume_patch_sycl.cpp} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename unsupported/test/{cxx11_tensor_image_patchOP_sycl.cpp => cxx11_tensor_image_patch_sycl.cpp} (100%) rename unsupported/test/{cxx11_tensor_volume_patchOP_sycl.cpp => cxx11_tensor_volume_patch_sycl.cpp} (100%) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 4a558f856..cdf151f15 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -171,8 +171,8 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_generator_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_image_patch_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_volume_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_argmax_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) diff --git a/unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp similarity index 100% rename from unsupported/test/cxx11_tensor_image_patchOP_sycl.cpp rename to unsupported/test/cxx11_tensor_image_patch_sycl.cpp diff --git a/unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp similarity index 100% rename from unsupported/test/cxx11_tensor_volume_patchOP_sycl.cpp rename to unsupported/test/cxx11_tensor_volume_patch_sycl.cpp From 0f83aeb6b22840c21c3fc2b90d3af18a65a0798a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 14 Apr 2017 10:22:12 +0200 Subject: [PATCH 093/680] Improve cmake scripts for Pastix and BLAS detection. --- bench/spbench/CMakeLists.txt | 27 +- cmake/FindBLAS.cmake | 1503 ++++++++++++++++++++++++++++------ cmake/FindBLASEXT.cmake | 380 +++++++++ cmake/FindHWLOC.cmake | 331 ++++++++ cmake/FindMetis.cmake | 299 +++++-- cmake/FindPTSCOTCH.cmake | 423 ++++++++++ cmake/FindPastix.cmake | 717 +++++++++++++++- cmake/FindScotch.cmake | 379 ++++++++- test/CMakeLists.txt | 29 +- 9 files changed, 3726 insertions(+), 362 deletions(-) create mode 100644 cmake/FindBLASEXT.cmake create mode 100644 cmake/FindHWLOC.cmake create mode 100644 cmake/FindPTSCOTCH.cmake diff --git a/bench/spbench/CMakeLists.txt b/bench/spbench/CMakeLists.txt index 8d53f4ae2..932735698 100644 --- a/bench/spbench/CMakeLists.txt +++ b/bench/spbench/CMakeLists.txt @@ -38,25 +38,32 @@ if(SUPERLU_FOUND AND BLAS_FOUND) endif() -find_package(Pastix) -find_package(Scotch) -find_package(Metis) -if(PASTIX_FOUND AND BLAS_FOUND) +find_package(PASTIX QUIET COMPONENTS METIS SCOTCH) +# check that the PASTIX found is a version without MPI +find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS + NAMES pastix_nompi.h + HINTS ${PASTIX_INCLUDE_DIRS} +) +if (NOT PASTIX_pastix_nompi.h_INCLUDE_DIRS) + message(STATUS "A version of Pastix has been found but pastix_nompi.h does not exist in the include directory." 
+ " Because Eigen tests require a version without MPI, we disable the Pastix backend.") +endif() +if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS AND BLAS_FOUND) add_definitions("-DEIGEN_PASTIX_SUPPORT") - include_directories(${PASTIX_INCLUDES}) + include_directories(${PASTIX_INCLUDE_DIRS_DEP}) if(SCOTCH_FOUND) - include_directories(${SCOTCH_INCLUDES}) + include_directories(${SCOTCH_INCLUDE_DIRS}) set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${SCOTCH_LIBRARIES}) elseif(METIS_FOUND) - include_directories(${METIS_INCLUDES}) + include_directories(${METIS_INCLUDE_DIRS}) set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${METIS_LIBRARIES}) endif(SCOTCH_FOUND) - set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${BLAS_LIBRARIES}) - set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${BLAS_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES}) + set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP}) endif(PASTIX_FOUND AND BLAS_FOUND) if(METIS_FOUND) - include_directories(${METIS_INCLUDES}) + include_directories(${METIS_INCLUDE_DIRS}) set (SPARSE_LIBS ${SPARSE_LIBS} ${METIS_LIBRARIES}) add_definitions("-DEIGEN_METIS_SUPPORT") endif(METIS_FOUND) diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake index 68c4e0724..9f74b07fe 100644 --- a/cmake/FindBLAS.cmake +++ b/cmake/FindBLAS.cmake @@ -1,385 +1,1363 @@ -# Find BLAS library +### # -# This module finds an installed library that implements the BLAS +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find BLAS library +# This module finds an installed fortran library that implements the BLAS # linear-algebra interface (see http://www.netlib.org/blas/). -# The list of libraries searched for is mainly taken +# The list of libraries searched for is taken # from the autoconf macro file, acx_blas.m4 (distributed at # http://ac-archive.sourceforge.net/ac-archive/acx_blas.html). # # This module sets the following variables: # BLAS_FOUND - set to true if a library implementing the BLAS interface # is found -# BLAS_INCLUDE_DIR - Directories containing the BLAS header files -# BLAS_DEFINITIONS - Compilation options to use BLAS -# BLAS_LINKER_FLAGS - Linker flags to use BLAS (excluding -l +# BLAS_LINKER_FLAGS - uncached list of required linker flags (excluding -l # and -L). -# BLAS_LIBRARIES_DIR - Directories containing the BLAS libraries. -# May be null if BLAS_LIBRARIES contains libraries name using full path. -# BLAS_LIBRARIES - List of libraries to link against BLAS interface. -# May be null if the compiler supports auto-link (e.g. VC++). -# BLAS_USE_FILE - The name of the cmake module to include to compile -# applications or libraries using BLAS. +# BLAS_COMPILER_FLAGS - uncached list of required compiler flags (including -I for mkl headers). 
+# BLAS_LIBRARIES - uncached list of libraries (using full path name) to
+# link against to use BLAS
+# BLAS95_LIBRARIES - uncached list of libraries (using full path name)
+# to link against to use BLAS95 interface
+# BLAS95_FOUND - set to true if a library implementing the BLAS f95 interface
+# is found
+# BLA_STATIC if set on this determines what kind of linkage we do (static)
+# BLA_VENDOR if set checks only the specified vendor, if not set checks
+# all the possibilities
+# BLAS_VENDOR_FOUND stores the BLAS vendor found
+# BLA_F95 if set on tries to find the f95 interfaces for BLAS/LAPACK
+# The user can give specific paths where to find the libraries adding cmake
+# options at configure (ex: cmake path/to/project -DBLAS_DIR=path/to/blas):
+# BLAS_DIR - Where to find the base directory of blas
+# BLAS_INCDIR - Where to find the header files
+# BLAS_LIBDIR - Where to find the library files
+# The module can also look for the following environment variables if paths
+# are not given as cmake variable: BLAS_DIR, BLAS_INCDIR, BLAS_LIBDIR
+# For the MKL case, if no paths are given as hints, we will try to use the MKLROOT
+# environment variable
+# BLAS_VERBOSE Print some additional information during BLAS libraries detection
+##########
+### List of vendors (BLA_VENDOR) valid in this module
+##########
+## Open (for OpenBlas), Eigen (for EigenBlas), Goto, ATLAS, PhiPACK,
+##  CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT,
+## Intel10_32 (intel mkl v10 32 bit), Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model),
+## Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model),
+## Intel (older versions of mkl 32 and 64 bit),
+##  ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic
+# C/CXX should be enabled to use Intel mkl
+###
+# We handle different modes to find the dependency
+#
+# - Detection if already installed on the system
+# - BLAS libraries can be detected in different ways
+# Here is the order of precedence:
+# 1) we look in cmake variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined
+# 2) we look in environment variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined
+# 3) we look in common environment variables depending on the system (INCLUDE, C_INCLUDE_PATH, CPATH - LIB, DYLD_LIBRARY_PATH, LD_LIBRARY_PATH)
+# 4) we look in common system paths depending on the system, see for example paths contained in the following cmake variables:
+#   - CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES
+#   - CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_C_IMPLICIT_LINK_DIRECTORIES
+#

+#=============================================================================
+# Copyright 2007-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
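(As a concrete illustration of the hint variables documented above, a project consuming this module could drive the detection roughly as follows. This is a minimal sketch, not part of the patch itself; the target name spbench and the prefix /opt/openblas are hypothetical placeholders.)

    # configure with explicit hints, e.g.:
    #   cmake . -DBLAS_DIR=/opt/openblas -DBLA_VENDOR=Open -DBLAS_VERBOSE=ON
    find_package(BLAS)
    if(BLAS_FOUND)
      # BLAS_LIBRARIES holds full library paths; the two flag lists may be empty
      target_compile_options(spbench PRIVATE ${BLAS_COMPILER_FLAGS})
      target_link_libraries(spbench ${BLAS_LINKER_FLAGS} ${BLAS_LIBRARIES})
    endif()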
+
+## Some macros to print status when searching for headers and libs
+# This macro informs why the _lib_to_find file has not been found
+macro(Print_Find_Library_Blas_Status _libname _lib_to_find)
+
+  # save _libname upper/lower case
+  string(TOUPPER ${_libname} LIBNAME)
+  string(TOLOWER ${_libname} libname)
+
+  # print status
+  #message(" ")
+  if(${LIBNAME}_LIBDIR)
+    message("${Yellow}${LIBNAME}_LIBDIR is defined but ${_lib_to_find}"
+      "has not been found in ${ARGN}${ColourReset}")
+  else()
+    if(${LIBNAME}_DIR)
+      message("${Yellow}${LIBNAME}_DIR is defined but ${_lib_to_find}"
+        "has not been found in ${ARGN}${ColourReset}")
+    else()
+      message("${Yellow}${_lib_to_find} not found."
+        "Neither ${LIBNAME}_DIR nor ${LIBNAME}_LIBDIR"
+        "is defined, so we look for ${_lib_to_find} in"
+        "system paths (Linux: LD_LIBRARY_PATH, Windows: LIB,"
+        "Mac: DYLD_LIBRARY_PATH,"
+        "CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES,"
+        "CMAKE_C_IMPLICIT_LINK_DIRECTORIES)${ColourReset}")
+      if(_lib_env)
+        message("${Yellow}${_lib_to_find} has not been found in"
+          "${_lib_env}${ColourReset}")
+      endif()
+    endif()
+  endif()
+  message("${BoldYellow}Please indicate where to find ${_lib_to_find}. You have four options:\n"
+    "- Option 1: Provide the installation directory of BLAS library with cmake option: -D${LIBNAME}_DIR=your/path/to/${libname}/\n"
+    "- Option 2: Provide the directory where to find the library with cmake option: -D${LIBNAME}_LIBDIR=your/path/to/${libname}/lib/\n"
+    "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n"
+    "- Option 4: If your library provides a PkgConfig file, make sure pkg-config finds your library${ColourReset}")
+
+endmacro()
+
+# This macro informs why the check of the symbol _name failed
+macro(Print_Find_Library_Blas_CheckFunc_Status _name)
+
+  # save _name upper/lower case
+  string(TOUPPER ${_name} FUNCNAME)
+  string(TOLOWER ${_name} funcname)
+
+  # print status
+  #message(" ")
+  message("${Red}Libs have been found but check of symbol ${_name} failed "
+    "with following libraries ${ARGN}${ColourReset}")
+  message("${BoldRed}Please open your error file CMakeFiles/CMakeError.log"
+    "to figure out why it fails${ColourReset}")
+  #message(" ")
+
+endmacro()
+
+if (NOT BLAS_FOUND)
+  set(BLAS_DIR "" CACHE PATH "Installation directory of BLAS library")
+  if (NOT BLAS_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely BLAS_DIR, has been set to specify the install directory of BLAS")
+  endif()
+endif()
+
+option(BLAS_VERBOSE "Print some additional information during BLAS libraries detection" OFF)
+mark_as_advanced(BLAS_VERBOSE)
 
 include(CheckFunctionExists)
+include(CheckFortranFunctionExists)
+
+set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
 
-# This macro checks for the existence of the combination of fortran libraries
-# given by _list.  If the combination is found, this macro checks (using the
-# check_function_exists macro) whether can link against that library
-# combination using the name of a routine given by _name using the linker
-# flags given by _flags.  If the combination of libraries is found and passes
-# the link test, LIBRARIES is set to the list of complete library paths that
-# have been found and DEFINITIONS to the required definitions.
-# Otherwise, LIBRARIES is set to FALSE.
-# N.B. _prefix is the prefix applied to the names of all cached variables that
-# are generated internally and marked advanced by this macro.
-macro(check_fortran_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _path) - #message("DEBUG: check_fortran_libraries(${_list} in ${_path})") +# Check the language being used +get_property( _LANGUAGES_ GLOBAL PROPERTY ENABLED_LANGUAGES ) +if( _LANGUAGES_ MATCHES Fortran ) + set( _CHECK_FORTRAN TRUE ) +elseif( (_LANGUAGES_ MATCHES C) OR (_LANGUAGES_ MATCHES CXX) ) + set( _CHECK_FORTRAN FALSE ) +else() + if(BLAS_FIND_REQUIRED) + message(FATAL_ERROR "FindBLAS requires Fortran, C, or C++ to be enabled.") + else() + message(STATUS "Looking for BLAS... - NOT found (Unsupported languages)") + return() + endif() +endif() - # Check for the existence of the libraries given by _list - set(_libraries_found TRUE) - set(_libraries_work FALSE) - set(${DEFINITIONS} "") - set(${LIBRARIES} "") +macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread) + # This macro checks for the existence of the combination of fortran libraries + # given by _list. If the combination is found, this macro checks (using the + # Check_Fortran_Function_Exists macro) whether can link against that library + # combination using the name of a routine given by _name using the linker + # flags given by _flags. If the combination of libraries is found and passes + # the link test, LIBRARIES is set to the list of complete library paths that + # have been found. Otherwise, LIBRARIES is set to FALSE. + + # N.B. _prefix is the prefix applied to the names of all cached variables that + # are generated internally and marked advanced by this macro. + + set(_libdir ${ARGN}) + + set(_libraries_work TRUE) + set(${LIBRARIES}) set(_combined_name) + set(ENV_MKLROOT "$ENV{MKLROOT}") + set(ENV_BLAS_DIR "$ENV{BLAS_DIR}") + set(ENV_BLAS_LIBDIR "$ENV{BLAS_LIBDIR}") + if (NOT _libdir) + if (BLAS_LIBDIR) + list(APPEND _libdir "${BLAS_LIBDIR}") + elseif (BLAS_DIR) + list(APPEND _libdir "${BLAS_DIR}") + list(APPEND _libdir "${BLAS_DIR}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${BLAS_DIR}/lib64") + list(APPEND _libdir "${BLAS_DIR}/lib/intel64") + else() + list(APPEND _libdir "${BLAS_DIR}/lib32") + list(APPEND _libdir "${BLAS_DIR}/lib/ia32") + endif() + elseif(ENV_BLAS_LIBDIR) + list(APPEND _libdir "${ENV_BLAS_LIBDIR}") + elseif(ENV_BLAS_DIR) + list(APPEND _libdir "${ENV_BLAS_DIR}") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib64") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib/intel64") + else() + list(APPEND _libdir "${ENV_BLAS_DIR}/lib32") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib/ia32") + endif() + else() + if (ENV_MKLROOT) + list(APPEND _libdir "${ENV_MKLROOT}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${ENV_MKLROOT}/lib64") + list(APPEND _libdir "${ENV_MKLROOT}/lib/intel64") + else() + list(APPEND _libdir "${ENV_MKLROOT}/lib32") + list(APPEND _libdir "${ENV_MKLROOT}/lib/ia32") + endif() + endif() + if (WIN32) + string(REPLACE ":" ";" _libdir2 "$ENV{LIB}") + elseif (APPLE) + string(REPLACE ":" ";" _libdir2 "$ENV{DYLD_LIBRARY_PATH}") + else () + string(REPLACE ":" ";" _libdir2 "$ENV{LD_LIBRARY_PATH}") + endif () + list(APPEND _libdir "${_libdir2}") + list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() + endif () + + if (BLAS_VERBOSE) + message("${Cyan}Try to find BLAS libraries: ${_list}") + endif () + foreach(_library ${_list}) set(_combined_name ${_combined_name}_${_library}) - if(_libraries_found) - # 
search first in ${_path} - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS ${_path} NO_DEFAULT_PATH - ) - # if not found, search in environment variables and system - if ( WIN32 ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS ENV LIB - ) - elseif ( APPLE ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV DYLD_LIBRARY_PATH - ) + if(_libraries_work) + if (BLA_STATIC) + if (WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + if (APPLE) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else () + set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () else () - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH - ) - endif() + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + # for ubuntu's libblas3gf and liblapack3gf packages + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf) + endif () + endif () + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + HINTS ${_libdir} + NO_DEFAULT_PATH + ) mark_as_advanced(${_prefix}_${_library}_LIBRARY) + # Print status if not found + # ------------------------- + if (NOT ${_prefix}_${_library}_LIBRARY AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE) + Print_Find_Library_Blas_Status(blas ${_library} ${_libdir}) + endif () set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) - set(_libraries_found ${${_prefix}_${_library}_LIBRARY}) - endif(_libraries_found) + set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) + endif(_libraries_work) endforeach(_library ${_list}) - if(_libraries_found) - set(_libraries_found ${${LIBRARIES}}) - endif() - # Test this combination of libraries with the Fortran/f2c interface. - # We test the Fortran interface first as it is well standardized. - if(_libraries_found AND NOT _libraries_work) - set(${DEFINITIONS} "-D${_prefix}_USE_F2C") - set(${LIBRARIES} ${_libraries_found}) - # Some C++ linkers require the f2c library to link with Fortran libraries. - # I do not know which ones, thus I just add the f2c library if it is available. - find_package( F2C QUIET ) - if ( F2C_FOUND ) - set(${DEFINITIONS} ${${DEFINITIONS}} ${F2C_DEFINITIONS}) - set(${LIBRARIES} ${${LIBRARIES}} ${F2C_LIBRARIES}) + if(_libraries_work) + # Test this combination of libraries. + if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC) + list(INSERT ${LIBRARIES} 0 "-Wl,--start-group") + list(APPEND ${LIBRARIES} "-Wl,--end-group") endif() - set(CMAKE_REQUIRED_DEFINITIONS ${${DEFINITIONS}}) - set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}}) - #message("DEBUG: CMAKE_REQUIRED_DEFINITIONS = ${CMAKE_REQUIRED_DEFINITIONS}") - #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") - # Check if function exists with f2c calling convention (ie a trailing underscore) - check_function_exists(${_name}_ ${_prefix}_${_name}_${_combined_name}_f2c_WORKS) - set(CMAKE_REQUIRED_DEFINITIONS} "") - set(CMAKE_REQUIRED_LIBRARIES "") - mark_as_advanced(${_prefix}_${_name}_${_combined_name}_f2c_WORKS) - set(_libraries_work ${${_prefix}_${_name}_${_combined_name}_f2c_WORKS}) - endif(_libraries_found AND NOT _libraries_work) - - # If not found, test this combination of libraries with a C interface. - # A few implementations (ie ACML) provide a C interface. Unfortunately, there is no standard. 
- if(_libraries_found AND NOT _libraries_work) - set(${DEFINITIONS} "") - set(${LIBRARIES} ${_libraries_found}) - set(CMAKE_REQUIRED_DEFINITIONS "") - set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}}) - #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") - check_function_exists(${_name} ${_prefix}_${_name}${_combined_name}_WORKS) - set(CMAKE_REQUIRED_LIBRARIES "") - mark_as_advanced(${_prefix}_${_name}${_combined_name}_WORKS) - set(_libraries_work ${${_prefix}_${_name}${_combined_name}_WORKS}) - endif(_libraries_found AND NOT _libraries_work) - - # on failure - if(NOT _libraries_work) - set(${DEFINITIONS} "") - set(${LIBRARIES} FALSE) + set(CMAKE_REQUIRED_LIBRARIES "${_flags};${${LIBRARIES}};${_thread}") + set(CMAKE_REQUIRED_FLAGS "${BLAS_COMPILER_FLAGS}") + if (BLAS_VERBOSE) + message("${Cyan}BLAS libs found for BLA_VENDOR ${BLA_VENDOR}." + "Try to compile symbol ${_name} with following libraries:" + "${CMAKE_REQUIRED_LIBRARIES}") + endif () + if(NOT BLAS_FOUND) + unset(${_prefix}${_combined_name}_WORKS CACHE) + endif() + if (_CHECK_FORTRAN) + if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") + string(REPLACE "mkl_intel_lp64" "mkl_gf_lp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + string(REPLACE "mkl_intel_ilp64" "mkl_gf_ilp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + endif() + check_fortran_function_exists("${_name}" ${_prefix}${_combined_name}_WORKS) + else() + check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) + endif() + mark_as_advanced(${_prefix}${_combined_name}_WORKS) + set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) + # Print status if not found + # ------------------------- + if (NOT _libraries_work AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE) + Print_Find_Library_Blas_CheckFunc_Status(${_name} ${CMAKE_REQUIRED_LIBRARIES}) + endif () + set(CMAKE_REQUIRED_LIBRARIES) endif() - #message("DEBUG: ${DEFINITIONS} = ${${DEFINITIONS}}") - #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}") -endmacro(check_fortran_libraries) + + if(_libraries_work) + set(${LIBRARIES} ${${LIBRARIES}} ${_thread}) + else(_libraries_work) + set(${LIBRARIES} FALSE) + endif(_libraries_work) + +endmacro(Check_Fortran_Libraries) -# -# main -# +set(BLAS_LINKER_FLAGS) +set(BLAS_LIBRARIES) +set(BLAS95_LIBRARIES) +if ($ENV{BLA_VENDOR} MATCHES ".+") + set(BLA_VENDOR $ENV{BLA_VENDOR}) +else () + if(NOT BLA_VENDOR) + set(BLA_VENDOR "All") + endif() +endif () -# Is it already configured? -if (BLAS_LIBRARIES_DIR OR BLAS_LIBRARIES) +#BLAS in intel mkl 10 library? 
(em64t 64bit) +if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") - set(BLAS_FOUND TRUE) + if(NOT BLAS_LIBRARIES OR BLA_VENDOR MATCHES "Intel*") + # Looking for include + # ------------------- -else() + # Add system include paths to search include + # ------------------------------------------ + unset(_inc_env) + set(ENV_MKLROOT "$ENV{MKLROOT}") + set(ENV_BLAS_DIR "$ENV{BLAS_DIR}") + set(ENV_BLAS_INCDIR "$ENV{BLAS_INCDIR}") + if(ENV_BLAS_INCDIR) + list(APPEND _inc_env "${ENV_BLAS_INCDIR}") + elseif(ENV_BLAS_DIR) + list(APPEND _inc_env "${ENV_BLAS_DIR}") + list(APPEND _inc_env "${ENV_BLAS_DIR}/include") + else() + if (ENV_MKLROOT) + list(APPEND _inc_env "${ENV_MKLROOT}/include") + endif() + # system variables + if(WIN32) + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + list(REMOVE_DUPLICATES _inc_env) - # reset variables - set( BLAS_INCLUDE_DIR "" ) - set( BLAS_DEFINITIONS "" ) - set( BLAS_LINKER_FLAGS "" ) - set( BLAS_LIBRARIES "" ) - set( BLAS_LIBRARIES_DIR "" ) + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_inc_env}") - # - # If Unix, search for BLAS function in possible libraries - # + # Try to find the fftw header in the given paths + # ------------------------------------------------- + # call cmake macro to find the header path + if(BLAS_INCDIR) + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES mkl.h + HINTS ${BLAS_INCDIR}) + else() + if(BLAS_DIR) + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES mkl.h + HINTS ${BLAS_DIR} + PATH_SUFFIXES "include") + else() + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES mkl.h + HINTS ${PATH_TO_LOOK_FOR}) + endif() + endif() + mark_as_advanced(BLAS_mkl.h_DIRS) - # BLAS in ATLAS library? 
(http://math-atlas.sourceforge.net/) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + # If found, add path to cmake variable + # ------------------------------------ + if (BLAS_mkl.h_DIRS) + set(BLAS_INCLUDE_DIRS "${BLAS_mkl.h_DIRS}") + else () + set(BLAS_INCLUDE_DIRS "BLAS_INCLUDE_DIRS-NOTFOUND") + if(NOT BLAS_FIND_QUIETLY) + message(STATUS "Looking for BLAS -- mkl.h not found") + endif() + endif() + + if (WIN32) + string(REPLACE ":" ";" _libdir "$ENV{LIB}") + elseif (APPLE) + string(REPLACE ":" ";" _libdir "$ENV{DYLD_LIBRARY_PATH}") + else () + string(REPLACE ":" ";" _libdir "$ENV{LD_LIBRARY_PATH}") + endif () + list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + # libiomp5 + # -------- + set(OMP_iomp5_LIBRARY "OMP_iomp5_LIBRARY-NOTFOUND") + find_library(OMP_iomp5_LIBRARY + NAMES iomp5 + HINTS ${_libdir} + ) + mark_as_advanced(OMP_iomp5_LIBRARY) + set(OMP_LIB "") + # libgomp + # ------- + set(OMP_gomp_LIBRARY "OMP_gomp_LIBRARY-NOTFOUND") + find_library(OMP_gomp_LIBRARY + NAMES gomp + HINTS ${_libdir} + ) + mark_as_advanced(OMP_gomp_LIBRARY) + # choose one or another depending on the compilo + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + if (OMP_gomp_LIBRARY) + set(OMP_LIB "${OMP_gomp_LIBRARY}") + endif() + else(CMAKE_C_COMPILER_ID STREQUAL "Intel") + if (OMP_iomp5_LIBRARY) + set(OMP_LIB "${OMP_iomp5_LIBRARY}") + endif() + endif() + + if (UNIX AND NOT WIN32) + # m + find_library(M_LIBRARY + NAMES m + HINTS ${_libdir}) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + set(LM "-lm") + else() + set(LM "") + endif() + # Fortran + set(LGFORTRAN "") + if (CMAKE_C_COMPILER_ID MATCHES "GNU") + find_library( + FORTRAN_gfortran_LIBRARY + NAMES gfortran + HINTS ${_libdir} + ) + mark_as_advanced(FORTRAN_gfortran_LIBRARY) + if (FORTRAN_gfortran_LIBRARY) + set(LGFORTRAN "${FORTRAN_gfortran_LIBRARY}") + endif() + elseif (CMAKE_C_COMPILER_ID MATCHES "Intel") + find_library( + FORTRAN_ifcore_LIBRARY + NAMES ifcore + HINTS ${_libdir} + ) + mark_as_advanced(FORTRAN_ifcore_LIBRARY) + if (FORTRAN_ifcore_LIBRARY) + set(LGFORTRAN "{FORTRAN_ifcore_LIBRARY}") + endif() + endif() + set(BLAS_COMPILER_FLAGS "") + if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq") + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_COMPILER_FLAGS "-openmp") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_COMPILER_FLAGS "-fopenmp") + endif() + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + if (BLA_VENDOR STREQUAL "Intel10_32") + list(APPEND BLAS_COMPILER_FLAGS "-m32") + else() + list(APPEND BLAS_COMPILER_FLAGS "-m64") + endif() + if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq") + list(APPEND OMP_LIB "-ldl") + endif() + if (ENV_MKLROOT) + list(APPEND BLAS_COMPILER_FLAGS "-I${ENV_MKLROOT}/include") + endif() + endif() + + set(additional_flags "") + if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(additional_flags "-Wl,--no-as-needed") + endif() + endif () + + if (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) + if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) + find_package(Threads) + else() + find_package(Threads REQUIRED) + endif() + + set(BLAS_SEARCH_LIBS "") + + if(BLA_F95) + + set(BLAS_mkl_SEARCH_SYMBOL SGEMM) + set(_LIBRARIES BLAS95_LIBRARIES) + if (WIN32) + if (BLA_STATIC) + set(BLAS_mkl_DLL_SUFFIX "") + else() + set(BLAS_mkl_DLL_SUFFIX "_dll") + endif() + + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL 
"Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95${BLAS_mkl_DLL_SUFFIX} mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR STREQUAL "Intel10_64lp*" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95_lp64${BLAS_mkl_DLL_SUFFIX} mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + if (NOT BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else (WIN32) + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_intel mkl_intel_thread mkl_core guide") + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_intel_lp64 mkl_intel_thread mkl_core guide") + # mkl >= 10.3 + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_lp64 mkl_intel_lp64 mkl_intel_thread mkl_core") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_lp64 mkl_intel_lp64 mkl_gnu_thread mkl_core") + endif() + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_sequential mkl_core") + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq") + set(OMP_LIB "") + endif() + endif () + endif (WIN32) + + else (BLA_F95) + + set(BLAS_mkl_SEARCH_SYMBOL sgemm) + set(_LIBRARIES BLAS_LIBRARIES) + if (WIN32) + if (BLA_STATIC) + set(BLAS_mkl_DLL_SUFFIX "") + else() + set(BLAS_mkl_DLL_SUFFIX "_dll") + endif() + + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR STREQUAL "Intel10_64lp*" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (NOT BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else (WIN32) + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL 
"All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel mkl_intel_thread mkl_core guide") + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_intel_thread mkl_core guide") + # mkl >= 10.3 + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_intel_thread mkl_core") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_gnu_thread mkl_core") + endif() + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_sequential mkl_core") + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq") + set(OMP_LIB "") + endif() + endif () + #older vesions of intel mkl libs + if (BLA_VENDOR STREQUAL "Intel" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl") + list(APPEND BLAS_SEARCH_LIBS + "mkl_ia32") + list(APPEND BLAS_SEARCH_LIBS + "mkl_em64t") + endif () + endif (WIN32) + + endif (BLA_F95) + + foreach (IT ${BLAS_SEARCH_LIBS}) + string(REPLACE " " ";" SEARCH_LIBS ${IT}) + if (${_LIBRARIES}) + else () + check_fortran_libraries( + ${_LIBRARIES} + BLAS + ${BLAS_mkl_SEARCH_SYMBOL} + "${additional_flags}" + "${SEARCH_LIBS}" + "${OMP_LIB};${CMAKE_THREAD_LIBS_INIT};${LM}" + ) + if(_LIBRARIES) + set(BLAS_LINKER_FLAGS "${additional_flags}") + endif() + endif() + endforeach () + if(NOT BLAS_FIND_QUIETLY) + if(${_LIBRARIES}) + message(STATUS "Looking for MKL BLAS: found") + else() + message(STATUS "Looking for MKL BLAS: not found") + endif() + endif() + if (${_LIBRARIES} AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Intel MKL") + endif() + endif (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) + endif(NOT BLAS_LIBRARIES OR BLA_VENDOR MATCHES "Intel*") +endif (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") + + +if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # gotoblas (http://www.tacc.utexas.edu/tacc-projects/gotoblas2) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "cblas;f77blas;atlas" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "goto2" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Goto BLAS: found") + else() + message(STATUS "Looking for Goto BLAS: not found") + endif() endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Goto") + endif() - # BLAS in PhiPACK libraries? 
(requires generic BLAS lib, too) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS +endif (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + + +# OpenBlas +if (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # openblas (http://www.openblas.net/) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "openblas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Open BLAS: found") + else() + message(STATUS "Looking for Open BLAS: not found") + endif() + endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Openblas") + endif() + +endif (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All") + + +# EigenBlas +if (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "eigen_blas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + message(STATUS "Looking for Eigen BLAS: found") + else() + message(STATUS "Looking for Eigen BLAS: not found") + endif() + endif() + endif() + + if(NOT BLAS_LIBRARIES) + # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "eigen_blas_static" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Eigen BLAS: found") + else() + message(STATUS "Looking for Eigen BLAS: not found") + endif() + endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Eigen") + endif() + +endif (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All") + + +if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "f77blas;atlas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Atlas BLAS: found") + else() + message(STATUS "Looking for Atlas BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Atlas") + endif() + +endif (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All") + + +# BLAS in PhiPACK libraries? (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "sgemm;dgemm;blas" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for PhiPACK BLAS: found") + else() + message(STATUS "Looking for PhiPACK BLAS: not found") + endif() endif() + endif() - # BLAS in Alpha CXML library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "PhiPACK") + endif() + +endif (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All") + + +# BLAS in Alpha CXML library? 
+if (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "cxml" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for CXML BLAS: found") + else() + message(STATUS "Looking for CXML BLAS: not found") + endif() endif() + endif() - # BLAS in Alpha DXML library? (now called CXML, see above) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "CXML") + endif() + +endif (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All") + + +# BLAS in Alpha DXML library? (now called CXML, see above) +if (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "dxml" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for DXML BLAS: found") + else() + message(STATUS "Looking for DXML BLAS: not found") + endif() endif() + endif() - # BLAS in Sun Performance library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "DXML") + endif() + +endif (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All") + + +# BLAS in Sun Performance library? +if (BLA_VENDOR STREQUAL "SunPerf" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "-xlic_lib=sunperf" "sunperf;sunmath" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(BLAS_LIBRARIES) + set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf") + endif() + if(NOT BLAS_FIND_QUIETLY) if(BLAS_LIBRARIES) - # Extra linker flag - set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf") + message(STATUS "Looking for SunPerf BLAS: found") + else() + message(STATUS "Looking for SunPerf BLAS: not found") endif() endif() + endif() - # BLAS in SCSL library? (SGI/Cray Scientific Library) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SunPerf") + endif() + +endif () + + +# BLAS in SCSL library? (SGI/Cray Scientific Library) +if (BLA_VENDOR STREQUAL "SCSL" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "scsl" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for SCSL BLAS: found") + else() + message(STATUS "Looking for SCSL BLAS: not found") + endif() endif() + endif() - # BLAS in SGIMATH library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SunPerf") + endif() + +endif () + + +# BLAS in SGIMATH library? +if (BLA_VENDOR STREQUAL "SGIMATH" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "complib.sgimath" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for SGIMATH BLAS: found") + else() + message(STATUS "Looking for SGIMATH BLAS: not found") + endif() endif() + endif() - # BLAS in IBM ESSL library? 
(requires generic BLAS lib, too) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SGIMATH") + endif() + +endif () + + +# BLAS in IBM ESSL library (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "essl;blas" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "essl;xlfmath;xlf90_r;blas" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for IBM ESSL BLAS: found") + else() + message(STATUS "Looking for IBM ESSL BLAS: not found") + endif() endif() + endif() - #BLAS in intel mkl 10 library? (em64t 64bit) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "IBM ESSL") + endif() + +endif () + +# BLAS in IBM ESSL_MT library (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "IBMESSLMT" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "mkl_intel_lp64;mkl_intel_thread;mkl_core;guide;pthread" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" - ) - endif() - - ### windows version of intel mkl 10? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS - BLAS_LIBRARIES - BLAS - SGEMM + "esslsmp;xlsmp;xlfmath;xlf90_r;blas" "" - "mkl_c_dll;mkl_intel_thread_dll;mkl_core_dll;libguide40" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for IBM ESSL MT BLAS: found") + else() + message(STATUS "Looking for IBM ESSL MT BLAS: not found") + endif() endif() + endif() - #older versions of intel mkl libs + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "IBM ESSL MT") + endif() - # BLAS in intel mkl library? (shared) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS +endif () + + +#BLAS in acml library? 
+if (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All") + + if( ((BLA_VENDOR STREQUAL "ACML") AND (NOT BLAS_ACML_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_MP") AND (NOT BLAS_ACML_MP_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_GPU") AND (NOT BLAS_ACML_GPU_LIB_DIRS))) + + # try to find acml in "standard" paths + if( WIN32 ) + file( GLOB _ACML_ROOT "C:/AMD/acml*/ACML-EULA.txt" ) + else() + file( GLOB _ACML_ROOT "/opt/acml*/ACML-EULA.txt" ) + endif() + if( WIN32 ) + file( GLOB _ACML_GPU_ROOT "C:/AMD/acml*/GPGPUexamples" ) + else() + file( GLOB _ACML_GPU_ROOT "/opt/acml*/GPGPUexamples" ) + endif() + list(GET _ACML_ROOT 0 _ACML_ROOT) + list(GET _ACML_GPU_ROOT 0 _ACML_GPU_ROOT) + + if( _ACML_ROOT ) + + get_filename_component( _ACML_ROOT ${_ACML_ROOT} PATH ) + if( SIZEOF_INTEGER EQUAL 8 ) + set( _ACML_PATH_SUFFIX "_int64" ) + else() + set( _ACML_PATH_SUFFIX "" ) + endif() + if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) + set( _ACML_COMPILER32 "ifort32" ) + set( _ACML_COMPILER64 "ifort64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "SunPro" ) + set( _ACML_COMPILER32 "sun32" ) + set( _ACML_COMPILER64 "sun64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) + set( _ACML_COMPILER32 "pgi32" ) + if( WIN32 ) + set( _ACML_COMPILER64 "win64" ) + else() + set( _ACML_COMPILER64 "pgi64" ) + endif() + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "Open64" ) + # 32 bit builds not supported on Open64 but for code simplicity + # We'll just use the same directory twice + set( _ACML_COMPILER32 "open64_64" ) + set( _ACML_COMPILER64 "open64_64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "NAG" ) + set( _ACML_COMPILER32 "nag32" ) + set( _ACML_COMPILER64 "nag64" ) + else() + set( _ACML_COMPILER32 "gfortran32" ) + set( _ACML_COMPILER64 "gfortran64" ) + endif() + + if( BLA_VENDOR STREQUAL "ACML_MP" ) + set(_ACML_MP_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}_mp${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}_mp${_ACML_PATH_SUFFIX}/lib" ) + else() + set(_ACML_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}${_ACML_PATH_SUFFIX}/lib" ) + endif() + + endif(_ACML_ROOT) + + elseif(BLAS_${BLA_VENDOR}_LIB_DIRS) + + set(_${BLA_VENDOR}_LIB_DIRS ${BLAS_${BLA_VENDOR}_LIB_DIRS}) + + endif() + + if( BLA_VENDOR STREQUAL "ACML_MP" ) + foreach( BLAS_ACML_MP_LIB_DIRS ${_ACML_MP_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml_mp;acml_mv" "" ${BLAS_ACML_MP_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + elseif( BLA_VENDOR STREQUAL "ACML_GPU" ) + foreach( BLAS_ACML_GPU_LIB_DIRS ${_ACML_GPU_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv;CALBLAS" "" ${BLAS_ACML_GPU_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + else() + foreach( BLAS_ACML_LIB_DIRS ${_ACML_LIB_DIRS} ) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv" "" ${BLAS_ACML_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + endif() + + # Either acml or acml_mp should be in LD_LIBRARY_PATH but not both + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "mkl;guide;pthread" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "acml;acml_mv" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() endif() + endif() - #BLAS in intel mkl library? 
(static, 32bit) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "mkl_ia32;guide;pthread" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "acml_mp;acml_mv" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() endif() + endif() - #BLAS in intel mkl library? (static, em64t 64bit) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "mkl_em64t;guide;pthread" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" - ) - endif() - - #BLAS in acml library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS - BLAS_LIBRARIES - BLAS - sgemm + "acml;acml_mv;CALBLAS" "" - "acml" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() endif() + endif() - # Apple BLAS library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "ACML") + endif() + +endif (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All") # ACML + + +# Apple BLAS library? +if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS - sgemm + dgemm "" "Accelerate" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Apple BLAS: found") + else() + message(STATUS "Looking for Apple BLAS: not found") + endif() endif() + endif() - if ( NOT BLAS_LIBRARIES ) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Apple Accelerate") + endif() + +endif (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + + +if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + + if ( NOT BLAS_LIBRARIES ) + check_fortran_libraries( BLAS_LIBRARIES BLAS - sgemm + dgemm "" "vecLib" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" - ) - endif ( NOT BLAS_LIBRARIES ) - - # Generic BLAS library? - # This configuration *must* be the last try as this library is notably slow. - if ( NOT BLAS_LIBRARIES ) - check_fortran_libraries( - BLAS_DEFINITIONS - BLAS_LIBRARIES - BLAS - sgemm "" - "blas" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for NAS BLAS: found") + else() + message(STATUS "Looking for NAS BLAS: not found") + endif() endif() + endif () - if(BLAS_LIBRARIES_DIR OR BLAS_LIBRARIES) + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "NAS") + endif() + +endif (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + + +# Generic BLAS library? 
+if (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All") + + set(BLAS_SEARCH_LIBS "blas;blas_LINUX;blas_MAC;blas_WINDOWS;refblas") + foreach (SEARCH_LIB ${BLAS_SEARCH_LIBS}) + if (BLAS_LIBRARIES) + else () + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "${SEARCH_LIB}" + "${LGFORTRAN}" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Generic BLAS: found") + else() + message(STATUS "Looking for Generic BLAS: not found") + endif() + endif() + endif() + endforeach () + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Netlib or other Generic libblas") + endif() + +endif (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All") + + +if(BLA_F95) + + if(BLAS95_LIBRARIES) + set(BLAS95_FOUND TRUE) + else() + set(BLAS95_FOUND FALSE) + endif() + + if(NOT BLAS_FIND_QUIETLY) + if(BLAS95_FOUND) + message(STATUS "A library with BLAS95 API found.") + message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}") + else(BLAS95_FOUND) + message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas 95 libraries could not be found or check of symbols failed." + "\nPlease indicate where to find blas libraries. You have three options:\n" + "- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n" + "- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n" + "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" + "\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure." + "\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name." + "\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit)," + "Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") + if(BLAS_FIND_REQUIRED) + message(FATAL_ERROR + "A required library with BLAS95 API not found. Please specify library location.") + else() + message(STATUS + "A library with BLAS95 API not found. Please specify library location.") + endif() + endif(BLAS95_FOUND) + endif(NOT BLAS_FIND_QUIETLY) + + set(BLAS_FOUND TRUE) + set(BLAS_LIBRARIES "${BLAS95_LIBRARIES}") + +else(BLA_F95) + + if(BLAS_LIBRARIES) set(BLAS_FOUND TRUE) else() set(BLAS_FOUND FALSE) @@ -388,32 +1366,41 @@ else() if(NOT BLAS_FIND_QUIETLY) if(BLAS_FOUND) message(STATUS "A library with BLAS API found.") + message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}") else(BLAS_FOUND) + message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas libraries could not be found or check of symbols failed." + "\nPlease indicate where to find blas libraries. You have three options:\n" + "- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n" + "- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n" + "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" + "\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure." 
+ "\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name." + "\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit)," + "Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") if(BLAS_FIND_REQUIRED) - message(FATAL_ERROR "A required library with BLAS API not found. Please specify library location.") + message(FATAL_ERROR + "A required library with BLAS API not found. Please specify library location.") else() - message(STATUS "A library with BLAS API not found. Please specify library location.") + message(STATUS + "A library with BLAS API not found. Please specify library location.") endif() endif(BLAS_FOUND) endif(NOT BLAS_FIND_QUIETLY) - # Add variables to cache - set( BLAS_INCLUDE_DIR "${BLAS_INCLUDE_DIR}" - CACHE PATH "Directories containing the BLAS header files" FORCE ) - set( BLAS_DEFINITIONS "${BLAS_DEFINITIONS}" - CACHE STRING "Compilation options to use BLAS" FORCE ) - set( BLAS_LINKER_FLAGS "${BLAS_LINKER_FLAGS}" - CACHE STRING "Linker flags to use BLAS" FORCE ) - set( BLAS_LIBRARIES "${BLAS_LIBRARIES}" - CACHE FILEPATH "BLAS libraries name" FORCE ) - set( BLAS_LIBRARIES_DIR "${BLAS_LIBRARIES_DIR}" - CACHE PATH "Directories containing the BLAS libraries" FORCE ) +endif(BLA_F95) - #message("DEBUG: BLAS_INCLUDE_DIR = ${BLAS_INCLUDE_DIR}") - #message("DEBUG: BLAS_DEFINITIONS = ${BLAS_DEFINITIONS}") - #message("DEBUG: BLAS_LINKER_FLAGS = ${BLAS_LINKER_FLAGS}") - #message("DEBUG: BLAS_LIBRARIES = ${BLAS_LIBRARIES}") - #message("DEBUG: BLAS_LIBRARIES_DIR = ${BLAS_LIBRARIES_DIR}") - #message("DEBUG: BLAS_FOUND = ${BLAS_FOUND}") +set(CMAKE_FIND_LIBRARY_SUFFIXES ${_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) -endif(BLAS_LIBRARIES_DIR OR BLAS_LIBRARIES) +if (BLAS_FOUND) + list(GET BLAS_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)") + string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}") + set(BLAS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of BLAS library" FORCE) + else() + set(BLAS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of BLAS library" FORCE) + endif() +endif() +mark_as_advanced(BLAS_DIR) +mark_as_advanced(BLAS_DIR_FOUND) diff --git a/cmake/FindBLASEXT.cmake b/cmake/FindBLASEXT.cmake new file mode 100644 index 000000000..0fe7fb849 --- /dev/null +++ b/cmake/FindBLASEXT.cmake @@ -0,0 +1,380 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find BLAS EXTENDED for MORSE projects: find include dirs and libraries +# +# This module allows to find BLAS libraries by calling the official FindBLAS module +# and handles the creation of different library lists whether the user wishes to link +# with a sequential BLAS or a multihreaded (BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES). 
+# BLAS is detected with a FindBLAS call then if the BLAS vendor is Intel10_64lp, ACML +# or IBMESSLMT then the module attempts to find the corresponding multithreaded libraries. +# +# The following variables have been added to manage links with sequential or multithreaded +# versions: +# BLAS_INCLUDE_DIRS - BLAS include directories +# BLAS_LIBRARY_DIRS - Link directories for BLAS libraries +# BLAS_SEQ_LIBRARIES - BLAS component libraries to be linked (sequential) +# BLAS_PAR_LIBRARIES - BLAS component libraries to be linked (multithreaded) + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013-2016 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) + +# macro to factorize this call +macro(find_package_blas) + if(BLASEXT_FIND_REQUIRED) + if(BLASEXT_FIND_QUIETLY) + find_package(BLAS REQUIRED QUIET) + else() + find_package(BLAS REQUIRED) + endif() + else() + if(BLASEXT_FIND_QUIETLY) + find_package(BLAS QUIET) + else() + find_package(BLAS) + endif() + endif() +endmacro() + +# add a cache variable to let the user specify the BLAS vendor +set(BLA_VENDOR "" CACHE STRING "list of possible BLAS vendor: + Open, Eigen, Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT, + Intel10_32 (intel mkl v10 32 bit), + Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), + Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model), + Intel( older versions of mkl 32 and 64 bit), + ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") + +if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "In FindBLASEXT") + message(STATUS "If you want to force the use of one specific library, " + "\n please specify the BLAS vendor by setting -DBLA_VENDOR=blas_vendor_name" + "\n at cmake configure.") + message(STATUS "List of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, " + "\n DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT, Intel10_32 (intel mkl v10 32 bit)," + "\n Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model)," + "\n Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "\n Intel( older versions of mkl 32 and 64 bit)," + "\n ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") +endif() + +if (NOT BLAS_FOUND) + # First try to detect two cases: + # 1: only SEQ libs are handled + # 2: both SEQ and PAR libs are handled + find_package_blas() +endif () + +# detect the cases where SEQ and PAR libs are handled +if(BLA_VENDOR STREQUAL "All" AND + (BLAS_mkl_core_LIBRARY OR BLAS_mkl_core_dll_LIBRARY) + ) + set(BLA_VENDOR "Intel") + if(BLAS_mkl_intel_LIBRARY) + set(BLA_VENDOR "Intel10_32") + endif() + if(BLAS_mkl_intel_lp64_LIBRARY) + set(BLA_VENDOR "Intel10_64lp") + endif() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the MKL." 
+ "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +elseif(BLA_VENDOR STREQUAL "All" AND BLAS_acml_LIBRARY) + set(BLA_VENDOR "ACML") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the ACML." + "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +elseif(BLA_VENDOR STREQUAL "All" AND BLAS_essl_LIBRARY) + set(BLA_VENDOR "IBMESSL") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the ESSL." + "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +endif() + +# Intel case +if(BLA_VENDOR MATCHES "Intel*") + + ### + # look for include path if the BLAS vendor is Intel + ### + + # gather system include paths + unset(_inc_env) + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + set(ENV_MKLROOT "$ENV{MKLROOT}") + if (ENV_MKLROOT) + list(APPEND _inc_env "${ENV_MKLROOT}/include") + endif() + list(REMOVE_DUPLICATES _inc_env) + + # find mkl.h inside known include paths + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + if(BLAS_INCDIR) + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${BLAS_INCDIR}) + else() + if(BLAS_DIR) + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${BLAS_DIR} + PATH_SUFFIXES include) + else() + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${_inc_env}) + endif() + endif() + mark_as_advanced(BLAS_mkl.h_INCLUDE_DIRS) + ## Print status if not found + ## ------------------------- + #if (NOT BLAS_mkl.h_INCLUDE_DIRS AND MORSE_VERBOSE) + # Print_Find_Header_Status(blas mkl.h) + #endif () + set(BLAS_INCLUDE_DIRS "") + if(BLAS_mkl.h_INCLUDE_DIRS) + list(APPEND BLAS_INCLUDE_DIRS "${BLAS_mkl.h_INCLUDE_DIRS}" ) + endif() + + ### + # look for libs + ### + # if Intel 10 64 bit -> look for sequential and multithreaded versions + if(BLA_VENDOR MATCHES "Intel10_64lp*") + + ## look for the sequential version + set(BLA_VENDOR "Intel10_64lp_seq") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "Look for the sequential version Intel10_64lp_seq") + endif() + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "Intel10_64lp") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "Look for the multithreaded version Intel10_64lp") + endif() + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES 
"${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + + else() + + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + endif() + + # ACML case +elseif(BLA_VENDOR MATCHES "ACML*") + + ## look for the sequential version + set(BLA_VENDOR "ACML") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "ACML_MP") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + + # IBMESSL case +elseif(BLA_VENDOR MATCHES "IBMESSL*") + + ## look for the sequential version + set(BLA_VENDOR "IBMESSL") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "IBMESSLMT") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + +else() + + if(BLAS_FOUND) + # define the SEQ libs as the BLAS_LIBRARIES + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + +endif() + + +if(BLAS_SEQ_LIBRARIES) + set(BLAS_LIBRARIES "${BLAS_SEQ_LIBRARIES}") +endif() + +# extract libs paths +# remark: because it is not given by find_package(BLAS) +set(BLAS_LIBRARY_DIRS "") +string(REPLACE " " ";" BLAS_LIBRARIES "${BLAS_LIBRARIES}") +foreach(blas_lib ${BLAS_LIBRARIES}) + if (EXISTS "${blas_lib}") + get_filename_component(a_blas_lib_dir "${blas_lib}" PATH) + list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" ) + else() + string(REPLACE "-L" "" blas_lib "${blas_lib}") + if (EXISTS "${blas_lib}") + list(APPEND BLAS_LIBRARY_DIRS "${blas_lib}" ) + else() + get_filename_component(a_blas_lib_dir "${blas_lib}" PATH) + if (EXISTS "${a_blas_lib_dir}") + list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" ) + endif() + endif() + endif() +endforeach() +if (BLAS_LIBRARY_DIRS) + list(REMOVE_DUPLICATES BLAS_LIBRARY_DIRS) +endif () + +# check that BLAS has been found +# --------------------------------- +include(FindPackageHandleStandardArgs) +if(BLA_VENDOR MATCHES "Intel*") + if(BLA_VENDOR MATCHES "Intel10_64lp*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is Intel MKL:" + "\n we manage two lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS + BLAS_INCLUDE_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() + else() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS + BLAS_INCLUDE_DIRS) + endif() +elseif(BLA_VENDOR MATCHES "ACML*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is ACML:" + "\n we manage two 
lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() +elseif(BLA_VENDOR MATCHES "IBMESSL*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is ESSL:" + "\n we manage two lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() +else() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) +endif() diff --git a/cmake/FindHWLOC.cmake b/cmake/FindHWLOC.cmake new file mode 100644 index 000000000..a831b5c72 --- /dev/null +++ b/cmake/FindHWLOC.cmake @@ -0,0 +1,331 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find HWLOC include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(HWLOC +# [REQUIRED]) # Fail with error if hwloc is not found +# +# This module finds headers and hwloc library. +# Results are reported in variables: +# HWLOC_FOUND - True if headers and requested libraries were found +# HWLOC_INCLUDE_DIRS - hwloc include directories +# HWLOC_LIBRARY_DIRS - Link directories for hwloc libraries +# HWLOC_LIBRARIES - hwloc component libraries to be linked +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DHWLOC_DIR=path/to/hwloc): +# HWLOC_DIR - Where to find the base directory of hwloc +# HWLOC_INCDIR - Where to find the header files +# HWLOC_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: HWLOC_DIR, HWLOC_INCDIR, HWLOC_LIBDIR + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. 
+#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) + +include(CheckStructHasMember) +include(CheckCSourceCompiles) + +if (NOT HWLOC_FOUND) + set(HWLOC_DIR "" CACHE PATH "Installation directory of HWLOC library") + if (NOT HWLOC_FIND_QUIETLY) + message(STATUS "A cache variable, namely HWLOC_DIR, has been set to specify the install directory of HWLOC") + endif() +endif() + +set(ENV_HWLOC_DIR "$ENV{HWLOC_DIR}") +set(ENV_HWLOC_INCDIR "$ENV{HWLOC_INCDIR}") +set(ENV_HWLOC_LIBDIR "$ENV{HWLOC_LIBDIR}") +set(HWLOC_GIVEN_BY_USER "FALSE") +if ( HWLOC_DIR OR ( HWLOC_INCDIR AND HWLOC_LIBDIR) OR ENV_HWLOC_DIR OR (ENV_HWLOC_INCDIR AND ENV_HWLOC_LIBDIR) ) + set(HWLOC_GIVEN_BY_USER "TRUE") +endif() + +# Optionally use pkg-config to detect include/library dirs (if pkg-config is available) +# ------------------------------------------------------------------------------------- +include(FindPkgConfig) +find_package(PkgConfig QUIET) +if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER ) + + pkg_search_module(HWLOC hwloc) + if (NOT HWLOC_FIND_QUIETLY) + if (HWLOC_FOUND AND HWLOC_LIBRARIES) + message(STATUS "Looking for HWLOC - found using PkgConfig") + #if(NOT HWLOC_INCLUDE_DIRS) + # message("${Magenta}HWLOC_INCLUDE_DIRS is empty using PkgConfig." + # "Perhaps the path to hwloc headers is already present in your" + # "C(PLUS)_INCLUDE_PATH environment variable.${ColourReset}") + #endif() + else() + message(STATUS "${Magenta}Looking for HWLOC - not found using PkgConfig." + "\n Perhaps you should add the directory containing hwloc.pc to" + "\n the PKG_CONFIG_PATH environment variable.${ColourReset}") + endif() + endif() + +endif( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER ) + +if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) OR (HWLOC_GIVEN_BY_USER) ) + + if (NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for HWLOC - PkgConfig not used") + endif() + + # Looking for include + # ------------------- + + # Add system include paths to search include + # ------------------------------------------ + unset(_inc_env) + if(ENV_HWLOC_INCDIR) + list(APPEND _inc_env "${ENV_HWLOC_INCDIR}") + elseif(ENV_HWLOC_DIR) + list(APPEND _inc_env "${ENV_HWLOC_DIR}") + list(APPEND _inc_env "${ENV_HWLOC_DIR}/include") + list(APPEND _inc_env "${ENV_HWLOC_DIR}/include/hwloc") + else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + list(REMOVE_DUPLICATES _inc_env) + + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_inc_env}") + + # Try to find the hwloc header in the given paths + # ------------------------------------------------- + # call cmake macro to find the header path + if(HWLOC_INCDIR) + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${HWLOC_INCDIR}) + else() + if(HWLOC_DIR) + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + 
find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${HWLOC_DIR} + PATH_SUFFIXES "include" "include/hwloc") + else() + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${PATH_TO_LOOK_FOR} + PATH_SUFFIXES "hwloc") + endif() + endif() + mark_as_advanced(HWLOC_hwloc.h_DIRS) + + # Add path to cmake variable + # ------------------------------------ + if (HWLOC_hwloc.h_DIRS) + set(HWLOC_INCLUDE_DIRS "${HWLOC_hwloc.h_DIRS}") + else () + set(HWLOC_INCLUDE_DIRS "HWLOC_INCLUDE_DIRS-NOTFOUND") + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc -- hwloc.h not found") + endif() + endif () + + if (HWLOC_INCLUDE_DIRS) + list(REMOVE_DUPLICATES HWLOC_INCLUDE_DIRS) + endif () + + + # Looking for lib + # --------------- + + # Add system library paths to search lib + # -------------------------------------- + unset(_lib_env) + if(ENV_HWLOC_LIBDIR) + list(APPEND _lib_env "${ENV_HWLOC_LIBDIR}") + elseif(ENV_HWLOC_DIR) + list(APPEND _lib_env "${ENV_HWLOC_DIR}") + list(APPEND _lib_env "${ENV_HWLOC_DIR}/lib") + else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() + endif() + list(REMOVE_DUPLICATES _lib_env) + + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_lib_env}") + + # Try to find the hwloc lib in the given paths + # ---------------------------------------------- + + # call cmake macro to find the lib path + if(HWLOC_LIBDIR) + set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${HWLOC_LIBDIR}) + else() + if(HWLOC_DIR) + set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${HWLOC_DIR} + PATH_SUFFIXES lib lib32 lib64) + else() + set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${PATH_TO_LOOK_FOR}) + endif() + endif() + mark_as_advanced(HWLOC_hwloc_LIBRARY) + + # If found, add path to cmake variable + # ------------------------------------ + if (HWLOC_hwloc_LIBRARY) + get_filename_component(hwloc_lib_path ${HWLOC_hwloc_LIBRARY} PATH) + # set cmake variables (respects naming convention) + set(HWLOC_LIBRARIES "${HWLOC_hwloc_LIBRARY}") + set(HWLOC_LIBRARY_DIRS "${hwloc_lib_path}") + else () + set(HWLOC_LIBRARIES "HWLOC_LIBRARIES-NOTFOUND") + set(HWLOC_LIBRARY_DIRS "HWLOC_LIBRARY_DIRS-NOTFOUND") + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc -- lib hwloc not found") + endif() + endif () + + if (HWLOC_LIBRARY_DIRS) + list(REMOVE_DUPLICATES HWLOC_LIBRARY_DIRS) + endif () + + # check a function to validate the find + if(HWLOC_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # HWLOC + if (HWLOC_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${HWLOC_INCLUDE_DIRS}") + endif() + if (HWLOC_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${HWLOC_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${HWLOC_LIBRARIES}") + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX 
REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(HWLOC_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(hwloc_topology_init HWLOC_WORKS) + mark_as_advanced(HWLOC_WORKS) + + if(NOT HWLOC_WORKS) + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc : test of hwloc_topology_init with hwloc library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) + endif(HWLOC_LIBRARIES) + +endif( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) OR (HWLOC_GIVEN_BY_USER) ) + +if (HWLOC_LIBRARIES) + if (HWLOC_LIBRARY_DIRS) + list(GET HWLOC_LIBRARY_DIRS 0 first_lib_path) + else() + list(GET HWLOC_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + endif() + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(HWLOC_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of HWLOC library" FORCE) + else() + set(HWLOC_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of HWLOC library" FORCE) + endif() +endif() +mark_as_advanced(HWLOC_DIR) +mark_as_advanced(HWLOC_DIR_FOUND) + +# check that HWLOC has been found +# ------------------------------- +include(FindPackageHandleStandardArgs) +if (PKG_CONFIG_EXECUTABLE AND HWLOC_FOUND) + find_package_handle_standard_args(HWLOC DEFAULT_MSG + HWLOC_LIBRARIES) +else() + find_package_handle_standard_args(HWLOC DEFAULT_MSG + HWLOC_LIBRARIES + HWLOC_WORKS) +endif() + +if (HWLOC_FOUND) + set(HWLOC_SAVE_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) + list(APPEND CMAKE_REQUIRED_INCLUDES ${HWLOC_INCLUDE_DIRS}) + + # test headers to guess the version + check_struct_has_member( "struct hwloc_obj" parent hwloc.h HAVE_HWLOC_PARENT_MEMBER ) + check_struct_has_member( "struct hwloc_cache_attr_s" size hwloc.h HAVE_HWLOC_CACHE_ATTR ) + check_c_source_compiles( "#include + int main(void) { hwloc_obj_t o; o->type = HWLOC_OBJ_PU; return 0;}" HAVE_HWLOC_OBJ_PU) + include(CheckLibraryExists) + check_library_exists(${HWLOC_LIBRARIES} hwloc_bitmap_free "" HAVE_HWLOC_BITMAP) + + set(CMAKE_REQUIRED_INCLUDES ${HWLOC_SAVE_CMAKE_REQUIRED_INCLUDES}) +endif() diff --git a/cmake/FindMetis.cmake b/cmake/FindMetis.cmake index 6a0ce790c..da2f1f1d7 100644 --- a/cmake/FindMetis.cmake +++ b/cmake/FindMetis.cmake @@ -1,59 +1,264 @@ -# Pastix requires METIS or METIS (partitioning and reordering tools) +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find METIS include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(METIS +# [REQUIRED] # Fail with error if metis is not found +# ) +# +# This module finds headers and metis library. 
+# Results are reported in variables: +# METIS_FOUND - True if headers and requested libraries were found +# METIS_INCLUDE_DIRS - metis include directories +# METIS_LIBRARY_DIRS - Link directories for metis libraries +# METIS_LIBRARIES - metis component libraries to be linked +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DMETIS_DIR=path/to/metis): +# METIS_DIR - Where to find the base directory of metis +# METIS_INCDIR - Where to find the header files +# METIS_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: METIS_DIR, METIS_INCDIR, METIS_LIBDIR -if (METIS_INCLUDES AND METIS_LIBRARIES) - set(METIS_FIND_QUIETLY TRUE) -endif (METIS_INCLUDES AND METIS_LIBRARIES) +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) -find_path(METIS_INCLUDES - NAMES - metis.h - PATHS - $ENV{METISDIR} - ${INCLUDE_INSTALL_DIR} - PATH_SUFFIXES - . - metis - include -) - -macro(_metis_check_version) - file(READ "${METIS_INCLUDES}/metis.h" _metis_version_header) - - string(REGEX MATCH "define[ \t]+METIS_VER_MAJOR[ \t]+([0-9]+)" _metis_major_version_match "${_metis_version_header}") - set(METIS_MAJOR_VERSION "${CMAKE_MATCH_1}") - string(REGEX MATCH "define[ \t]+METIS_VER_MINOR[ \t]+([0-9]+)" _metis_minor_version_match "${_metis_version_header}") - set(METIS_MINOR_VERSION "${CMAKE_MATCH_1}") - string(REGEX MATCH "define[ \t]+METIS_VER_SUBMINOR[ \t]+([0-9]+)" _metis_subminor_version_match "${_metis_version_header}") - set(METIS_SUBMINOR_VERSION "${CMAKE_MATCH_1}") - if(NOT METIS_MAJOR_VERSION) - message(STATUS "Could not determine Metis version. 
Assuming version 4.0.0") - set(METIS_VERSION 4.0.0) - else() - set(METIS_VERSION ${METIS_MAJOR_VERSION}.${METIS_MINOR_VERSION}.${METIS_SUBMINOR_VERSION}) +if (NOT METIS_FOUND) + set(METIS_DIR "" CACHE PATH "Installation directory of METIS library") + if (NOT METIS_FIND_QUIETLY) + message(STATUS "A cache variable, namely METIS_DIR, has been set to specify the install directory of METIS") endif() - if(${METIS_VERSION} VERSION_LESS ${Metis_FIND_VERSION}) - set(METIS_VERSION_OK FALSE) +endif() + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_METIS_DIR "$ENV{METIS_DIR}") +set(ENV_METIS_INCDIR "$ENV{METIS_INCDIR}") +if(ENV_METIS_INCDIR) + list(APPEND _inc_env "${ENV_METIS_INCDIR}") +elseif(ENV_METIS_DIR) + list(APPEND _inc_env "${ENV_METIS_DIR}") + list(APPEND _inc_env "${ENV_METIS_DIR}/include") + list(APPEND _inc_env "${ENV_METIS_DIR}/include/metis") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") else() - set(METIS_VERSION_OK TRUE) + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the metis header in the given paths +# ------------------------------------------------- +# call cmake macro to find the header path +if(METIS_INCDIR) + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS ${METIS_INCDIR}) +else() + if(METIS_DIR) + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS ${METIS_DIR} + PATH_SUFFIXES "include" "include/metis") + else() + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS ${_inc_env}) + endif() +endif() +mark_as_advanced(METIS_metis.h_DIRS) + + +# If found, add path to cmake variable +# ------------------------------------ +if (METIS_metis.h_DIRS) + set(METIS_INCLUDE_DIRS "${METIS_metis.h_DIRS}") +else () + set(METIS_INCLUDE_DIRS "METIS_INCLUDE_DIRS-NOTFOUND") + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for metis -- metis.h not found") + endif() +endif() + + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_METIS_LIBDIR "$ENV{METIS_LIBDIR}") +if(ENV_METIS_LIBDIR) + list(APPEND _lib_env "${ENV_METIS_LIBDIR}") +elseif(ENV_METIS_DIR) + list(APPEND _lib_env "${ENV_METIS_DIR}") + list(APPEND _lib_env "${ENV_METIS_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the metis lib in the given paths +# ---------------------------------------------- +# call 
cmake macro to find the lib path +if(METIS_LIBDIR) + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${METIS_LIBDIR}) +else() + if(METIS_DIR) + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${METIS_DIR} + PATH_SUFFIXES lib lib32 lib64) + else() + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${_lib_env}) + endif() +endif() +mark_as_advanced(METIS_metis_LIBRARY) + + +# If found, add path to cmake variable +# ------------------------------------ +if (METIS_metis_LIBRARY) + get_filename_component(metis_lib_path "${METIS_metis_LIBRARY}" PATH) + # set cmake variables + set(METIS_LIBRARIES "${METIS_metis_LIBRARY}") + set(METIS_LIBRARY_DIRS "${metis_lib_path}") +else () + set(METIS_LIBRARIES "METIS_LIBRARIES-NOTFOUND") + set(METIS_LIBRARY_DIRS "METIS_LIBRARY_DIRS-NOTFOUND") + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for metis -- lib metis not found") + endif() +endif () + +# check a function to validate the find +if(METIS_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # METIS + if (METIS_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}") + endif() + if (METIS_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${METIS_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${METIS_LIBRARIES}") + # m + find_library(M_LIBRARY NAMES m) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") endif() - if(NOT METIS_VERSION_OK) - message(STATUS "Metis version ${METIS_VERSION} found in ${METIS_INCLUDES}, " - "but at least version ${Metis_FIND_VERSION} is required") - endif(NOT METIS_VERSION_OK) -endmacro(_metis_check_version) + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") - if(METIS_INCLUDES AND Metis_FIND_VERSION) - _metis_check_version() - else() - set(METIS_VERSION_OK TRUE) + # test link + unset(METIS_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(METIS_NodeND METIS_WORKS) + mark_as_advanced(METIS_WORKS) + + if(NOT METIS_WORKS) + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for METIS : test of METIS_NodeND with METIS library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif(METIS_LIBRARIES) +if (METIS_LIBRARIES) + list(GET METIS_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(METIS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of METIS library" FORCE) + else() + set(METIS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of METIS library" FORCE) + endif() +endif() +mark_as_advanced(METIS_DIR) +mark_as_advanced(METIS_DIR_FOUND) -find_library(METIS_LIBRARIES metis PATHS $ENV{METISDIR} 
${LIB_INSTALL_DIR} PATH_SUFFIXES lib)
-
+
+# check that METIS has been found
+# ---------------------------------
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(METIS DEFAULT_MSG
-                                  METIS_INCLUDES METIS_LIBRARIES METIS_VERSION_OK)
-
-mark_as_advanced(METIS_INCLUDES METIS_LIBRARIES)
+  METIS_LIBRARIES
+  METIS_WORKS)
+#
+# TODO: Add possibility to check for specific functions in the library
+#
diff --git a/cmake/FindPTSCOTCH.cmake b/cmake/FindPTSCOTCH.cmake
new file mode 100644
index 000000000..1396d0582
--- /dev/null
+++ b/cmake/FindPTSCOTCH.cmake
@@ -0,0 +1,423 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2016 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find PTSCOTCH include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(PTSCOTCH
+#               [REQUIRED]       # Fail with error if ptscotch is not found
+#               [COMPONENTS ...] # dependencies
+#              )
+#
+#  PTSCOTCH depends on the following libraries:
+#   - Threads
+#   - MPI
+#
+#  COMPONENTS can be some of the following:
+#   - ESMUMPS: to activate detection of PT-Scotch with the esmumps interface
+#
+# This module finds headers and ptscotch library.
+# Results are reported in variables:
+#  PTSCOTCH_FOUND            - True if headers and requested libraries were found
+#  PTSCOTCH_LINKER_FLAGS     - list of required linker flags (excluding -l and -L)
+#  PTSCOTCH_INCLUDE_DIRS     - ptscotch include directories
+#  PTSCOTCH_LIBRARY_DIRS     - Link directories for ptscotch libraries
+#  PTSCOTCH_LIBRARIES        - ptscotch component libraries to be linked
+#  PTSCOTCH_INCLUDE_DIRS_DEP - ptscotch + dependencies include directories
+#  PTSCOTCH_LIBRARY_DIRS_DEP - ptscotch + dependencies link directories
+#  PTSCOTCH_LIBRARIES_DEP    - ptscotch libraries + dependencies
+#  PTSCOTCH_INTSIZE          - Number of octets occupied by a SCOTCH_Num
+#
+# The user can give specific paths where to find the libraries adding cmake
+# options at configure (ex: cmake path/to/project -DPTSCOTCH_DIR=path/to/ptscotch):
+#  PTSCOTCH_DIR              - Where to find the base directory of ptscotch
+#  PTSCOTCH_INCDIR           - Where to find the header files
+#  PTSCOTCH_LIBDIR           - Where to find the library files
+# The module can also look for the following environment variables if paths
+# are not given as cmake variable: PTSCOTCH_DIR, PTSCOTCH_INCDIR, PTSCOTCH_LIBDIR
+
+#=============================================================================
+# Copyright 2012-2013 Inria
+# Copyright 2012-2013 Emmanuel Agullo
+# Copyright 2012-2013 Mathieu Faverge
+# Copyright 2012      Cedric Castagnede
+# Copyright 2013-2016 Florent Pruvost
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file MORSE-Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of Morse, substitute the full
+#  License text for the above reference.)
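+
+# A minimal usage sketch (the "example" target and the ESMUMPS component are
+# illustrative only; the variables are the ones documented above):
+#
+#  find_package(PTSCOTCH COMPONENTS ESMUMPS)
+#  if (PTSCOTCH_FOUND)
+#    include_directories(${PTSCOTCH_INCLUDE_DIRS_DEP})
+#    link_directories(${PTSCOTCH_LIBRARY_DIRS_DEP})
+#    target_link_libraries(example ${PTSCOTCH_LIBRARIES_DEP})
+#  endif()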
+ +if (NOT PTSCOTCH_FOUND) + set(PTSCOTCH_DIR "" CACHE PATH "Installation directory of PTSCOTCH library") + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "A cache variable, namely PTSCOTCH_DIR, has been set to specify the install directory of PTSCOTCH") + endif() +endif() + +# Set the version to find +set(PTSCOTCH_LOOK_FOR_ESMUMPS OFF) + +if( PTSCOTCH_FIND_COMPONENTS ) + foreach( component ${PTSCOTCH_FIND_COMPONENTS} ) + if (${component} STREQUAL "ESMUMPS") + # means we look for esmumps library + set(PTSCOTCH_LOOK_FOR_ESMUMPS ON) + endif() + endforeach() +endif() + +# PTSCOTCH depends on Threads, try to find it +if (NOT THREADS_FOUND) + if (PTSCOTCH_FIND_REQUIRED) + find_package(Threads REQUIRED) + else() + find_package(Threads) + endif() +endif() + +# PTSCOTCH depends on MPI, try to find it +if (NOT MPI_FOUND) + if (PTSCOTCH_FIND_REQUIRED) + find_package(MPI REQUIRED) + else() + find_package(MPI) + endif() +endif() + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_PTSCOTCH_DIR "$ENV{PTSCOTCH_DIR}") +set(ENV_PTSCOTCH_INCDIR "$ENV{PTSCOTCH_INCDIR}") +if(ENV_PTSCOTCH_INCDIR) + list(APPEND _inc_env "${ENV_PTSCOTCH_INCDIR}") +elseif(ENV_PTSCOTCH_DIR) + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}") + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}/include") + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}/include/ptscotch") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the ptscotch header in the given paths +# ------------------------------------------------- + +set(PTSCOTCH_hdrs_to_find "ptscotch.h;scotch.h") + +# call cmake macro to find the header path +if(PTSCOTCH_INCDIR) + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_INCDIR}) + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() +else() + if(PTSCOTCH_DIR) + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() + else() + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() + endif() +endif() + +# If found, add path to cmake variable +# ------------------------------------ +foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + if (PTSCOTCH_${ptscotch_hdr}_DIRS) + list(APPEND PTSCOTCH_INCLUDE_DIRS "${PTSCOTCH_${ptscotch_hdr}_DIRS}") + else () + 
set(PTSCOTCH_INCLUDE_DIRS "PTSCOTCH_INCLUDE_DIRS-NOTFOUND") + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "Looking for ptscotch -- ${ptscotch_hdr} not found") + endif() + endif() +endforeach() +list(REMOVE_DUPLICATES PTSCOTCH_INCLUDE_DIRS) + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_PTSCOTCH_LIBDIR "$ENV{PTSCOTCH_LIBDIR}") +if(ENV_PTSCOTCH_LIBDIR) + list(APPEND _lib_env "${ENV_PTSCOTCH_LIBDIR}") +elseif(ENV_PTSCOTCH_DIR) + list(APPEND _lib_env "${ENV_PTSCOTCH_DIR}") + list(APPEND _lib_env "${ENV_PTSCOTCH_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the ptscotch lib in the given paths +# ---------------------------------------------- + +set(PTSCOTCH_libs_to_find "ptscotch;ptscotcherr") +if (PTSCOTCH_LOOK_FOR_ESMUMPS) + list(INSERT PTSCOTCH_libs_to_find 0 "ptesmumps") + list(APPEND PTSCOTCH_libs_to_find "esmumps" ) +endif() +list(APPEND PTSCOTCH_libs_to_find "scotch;scotcherr") + +# call cmake macro to find the lib path +if(PTSCOTCH_LIBDIR) + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_LIBDIR}) + endforeach() +else() + if(PTSCOTCH_DIR) + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +set(PTSCOTCH_LIBRARIES "") +set(PTSCOTCH_LIBRARY_DIRS "") +# If found, add path to cmake variable +# ------------------------------------ +foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + + if (PTSCOTCH_${ptscotch_lib}_LIBRARY) + get_filename_component(${ptscotch_lib}_lib_path "${PTSCOTCH_${ptscotch_lib}_LIBRARY}" PATH) + # set cmake variables + list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") + list(APPEND PTSCOTCH_LIBRARY_DIRS "${${ptscotch_lib}_lib_path}") + else () + list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "Looking for ptscotch -- lib ${ptscotch_lib} not found") + endif() + endif () + + mark_as_advanced(PTSCOTCH_${ptscotch_lib}_LIBRARY) + +endforeach() +list(REMOVE_DUPLICATES PTSCOTCH_LIBRARY_DIRS) + +# check a function to validate the find +if(PTSCOTCH_LIBRARIES) + + set(REQUIRED_LDFLAGS) + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # PTSCOTCH + if (PTSCOTCH_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${PTSCOTCH_INCLUDE_DIRS}") + endif() + if (PTSCOTCH_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${PTSCOTCH_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${PTSCOTCH_LIBRARIES}") + # MPI + if (MPI_FOUND) + if (MPI_C_INCLUDE_PATH) + 
list(APPEND CMAKE_REQUIRED_INCLUDES "${MPI_C_INCLUDE_PATH}")
+    endif()
+    if (MPI_C_LINK_FLAGS)
+      if (${MPI_C_LINK_FLAGS} MATCHES " -")
+        string(REGEX REPLACE " -" "-" MPI_C_LINK_FLAGS ${MPI_C_LINK_FLAGS})
+      endif()
+      list(APPEND REQUIRED_LDFLAGS "${MPI_C_LINK_FLAGS}")
+    endif()
+    list(APPEND REQUIRED_LIBS "${MPI_C_LIBRARIES}")
+  endif()
+  # THREADS
+  if(CMAKE_THREAD_LIBS_INIT)
+    list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
+  endif()
+  set(Z_LIBRARY "Z_LIBRARY-NOTFOUND")
+  find_library(Z_LIBRARY NAMES z)
+  mark_as_advanced(Z_LIBRARY)
+  if(Z_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lz")
+  endif()
+  set(M_LIBRARY "M_LIBRARY-NOTFOUND")
+  find_library(M_LIBRARY NAMES m)
+  mark_as_advanced(M_LIBRARY)
+  if(M_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lm")
+  endif()
+  set(RT_LIBRARY "RT_LIBRARY-NOTFOUND")
+  find_library(RT_LIBRARY NAMES rt)
+  mark_as_advanced(RT_LIBRARY)
+  if(RT_LIBRARY)
+    list(APPEND REQUIRED_LIBS "-lrt")
+  endif()
+
+  # set required libraries for link
+  set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
+  set(CMAKE_REQUIRED_LIBRARIES)
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}")
+  foreach(lib_dir ${REQUIRED_LIBDIRS})
+    list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
+  endforeach()
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
+  list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}")
+  string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+
+  # test link
+  unset(PTSCOTCH_WORKS CACHE)
+  include(CheckFunctionExists)
+  check_function_exists(SCOTCH_dgraphInit PTSCOTCH_WORKS)
+  mark_as_advanced(PTSCOTCH_WORKS)
+
+  if(PTSCOTCH_WORKS)
+    # save link with dependencies
+    set(PTSCOTCH_LIBRARIES_DEP "${REQUIRED_LIBS}")
+    set(PTSCOTCH_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}")
+    set(PTSCOTCH_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}")
+    set(PTSCOTCH_LINKER_FLAGS "${REQUIRED_LDFLAGS}")
+    list(REMOVE_DUPLICATES PTSCOTCH_LIBRARY_DIRS_DEP)
+    list(REMOVE_DUPLICATES PTSCOTCH_INCLUDE_DIRS_DEP)
+    list(REMOVE_DUPLICATES PTSCOTCH_LINKER_FLAGS)
+  else()
+    if(NOT PTSCOTCH_FIND_QUIETLY)
+      message(STATUS "Looking for PTSCOTCH : test of SCOTCH_dgraphInit with PTSCOTCH library fails")
+      message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
+      message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
+      message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
+    endif()
+  endif()
+  set(CMAKE_REQUIRED_INCLUDES)
+  set(CMAKE_REQUIRED_FLAGS)
+  set(CMAKE_REQUIRED_LIBRARIES)
+endif(PTSCOTCH_LIBRARIES)
+
+if (PTSCOTCH_LIBRARIES)
+  list(GET PTSCOTCH_LIBRARIES 0 first_lib)
+  get_filename_component(first_lib_path "${first_lib}" PATH)
+  if (${first_lib_path} MATCHES "/lib(32|64)?$")
+    string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
+    set(PTSCOTCH_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of PTSCOTCH library" FORCE)
+  else()
+    set(PTSCOTCH_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of PTSCOTCH library" FORCE)
+  endif()
+endif()
+mark_as_advanced(PTSCOTCH_DIR)
+mark_as_advanced(PTSCOTCH_DIR_FOUND)
+
+# Check the size of SCOTCH_Num
+# ---------------------------------
+set(CMAKE_REQUIRED_INCLUDES ${PTSCOTCH_INCLUDE_DIRS})
+
+include(CheckCSourceRuns)
+#stdio.h and stdint.h should be included by scotch.h directly
+set(PTSCOTCH_C_TEST_SCOTCH_Num_4 "
+#include <stdio.h>
+#include <stdint.h>
+#include <ptscotch.h>
+int main(int argc, char **argv) {
+  if (sizeof(SCOTCH_Num) == 4)
+    return 0;
+  else
+    return 1;
+}
+")
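+
+# The same probe is repeated below for an 8-byte SCOTCH_Num; PTSCOTCH_INTSIZE
+# ends up holding the size that runs successfully (4 or 8), or -1 if neither does.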
+set(PTSCOTCH_C_TEST_SCOTCH_Num_8 "
+#include <stdio.h>
+#include <stdint.h>
+#include <ptscotch.h>
+int main(int argc, char **argv) {
+  if (sizeof(SCOTCH_Num) == 8)
+    return 0;
+  else
+    return 1;
+}
+")
+check_c_source_runs("${PTSCOTCH_C_TEST_SCOTCH_Num_4}" PTSCOTCH_Num_4)
+if(NOT PTSCOTCH_Num_4)
+  check_c_source_runs("${PTSCOTCH_C_TEST_SCOTCH_Num_8}" PTSCOTCH_Num_8)
+  if(NOT PTSCOTCH_Num_8)
+    set(PTSCOTCH_INTSIZE -1)
+  else()
+    set(PTSCOTCH_INTSIZE 8)
+  endif()
+else()
+  set(PTSCOTCH_INTSIZE 4)
+endif()
+set(CMAKE_REQUIRED_INCLUDES "")
+
+# check that PTSCOTCH has been found
+# ---------------------------------
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PTSCOTCH DEFAULT_MSG
+  PTSCOTCH_LIBRARIES
+  PTSCOTCH_WORKS)
+#
+# TODO: Add possibility to check for specific functions in the library
+#
diff --git a/cmake/FindPastix.cmake b/cmake/FindPastix.cmake
index e2e6c810d..470477fdc 100644
--- a/cmake/FindPastix.cmake
+++ b/cmake/FindPastix.cmake
@@ -1,25 +1,704 @@
-# Pastix lib requires linking to a blas library.
-# It is up to the user of this module to find a BLAS and link to it.
-# Pastix requires SCOTCH or METIS (partitioning and reordering tools) as well
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# - Find PASTIX include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(PASTIX
+#               [REQUIRED]       # Fail with error if pastix is not found
+#               [COMPONENTS ...] # dependencies
+#              )
+#
+#  PASTIX depends on the following libraries:
+#   - Threads, m, rt
+#   - MPI
+#   - HWLOC
+#   - BLAS
+#
+#  COMPONENTS are optional libraries PASTIX could be linked with;
+#  use them to drive detection of a specific compilation chain.
+#  COMPONENTS can be some of the following:
+#   - MPI: to activate detection of the parallel MPI version (default)
+#        it looks for Threads, HWLOC, BLAS, MPI and ScaLAPACK libraries
+#   - SEQ: to activate detection of the sequential version (exclude MPI version)
+#   - STARPU: to activate detection of StarPU version
+#        it looks for MPI version of StarPU (default behaviour)
+#        if SEQ and STARPU are given, it looks for a StarPU without MPI
+#   - STARPU_CUDA: to activate detection of StarPU with CUDA
+#   - STARPU_FXT: to activate detection of StarPU with FxT
+#   - SCOTCH: to activate detection of PASTIX linked with SCOTCH
+#   - PTSCOTCH: to activate detection of PASTIX linked with PTSCOTCH
+#   - METIS: to activate detection of PASTIX linked with METIS
+#
+# This module finds headers and pastix library.
+# Results are reported in variables: +# PASTIX_FOUND - True if headers and requested libraries were found +# PASTIX_LINKER_FLAGS - list of required linker flags (excluding -l and -L) +# PASTIX_INCLUDE_DIRS - pastix include directories +# PASTIX_LIBRARY_DIRS - Link directories for pastix libraries +# PASTIX_LIBRARIES - pastix libraries +# PASTIX_INCLUDE_DIRS_DEP - pastix + dependencies include directories +# PASTIX_LIBRARY_DIRS_DEP - pastix + dependencies link directories +# PASTIX_LIBRARIES_DEP - pastix libraries + dependencies +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DPASTIX_DIR=path/to/pastix): +# PASTIX_DIR - Where to find the base directory of pastix +# PASTIX_INCDIR - Where to find the header files +# PASTIX_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: PASTIX_DIR, PASTIX_INCDIR, PASTIX_LIBDIR -if (PASTIX_INCLUDES AND PASTIX_LIBRARIES) - set(PASTIX_FIND_QUIETLY TRUE) -endif (PASTIX_INCLUDES AND PASTIX_LIBRARIES) - -find_path(PASTIX_INCLUDES - NAMES - pastix_nompi.h - PATHS - $ENV{PASTIXDIR} - ${INCLUDE_INSTALL_DIR} -) - -find_library(PASTIX_LIBRARIES pastix PATHS $ENV{PASTIXDIR} ${LIB_INSTALL_DIR}) +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) 
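+
+# A minimal usage sketch, assuming an MPI build of PaStiX ordered with SCOTCH
+# (the "example" target is illustrative only; the variables are documented above):
+#
+#  find_package(PASTIX COMPONENTS MPI SCOTCH)
+#  if (PASTIX_FOUND)
+#    include_directories(${PASTIX_INCLUDE_DIRS_DEP})
+#    link_directories(${PASTIX_LIBRARY_DIRS_DEP})
+#    target_link_libraries(example ${PASTIX_LIBRARIES_DEP})
+#  endif()
+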
+if (NOT PASTIX_FOUND)
+  set(PASTIX_DIR "" CACHE PATH "Installation directory of PASTIX library")
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely PASTIX_DIR, has been set to specify the install directory of PASTIX")
+  endif()
+endif()
+
+# Set the version to find
+set(PASTIX_LOOK_FOR_MPI ON)
+set(PASTIX_LOOK_FOR_SEQ OFF)
+set(PASTIX_LOOK_FOR_STARPU OFF)
+set(PASTIX_LOOK_FOR_STARPU_CUDA OFF)
+set(PASTIX_LOOK_FOR_STARPU_FXT OFF)
+set(PASTIX_LOOK_FOR_SCOTCH ON)
+set(PASTIX_LOOK_FOR_PTSCOTCH OFF)
+set(PASTIX_LOOK_FOR_METIS OFF)
+
+if( PASTIX_FIND_COMPONENTS )
+  foreach( component ${PASTIX_FIND_COMPONENTS} )
+    if (${component} STREQUAL "SEQ")
+      # means we look for the sequential version of PaStiX (without MPI)
+      set(PASTIX_LOOK_FOR_SEQ ON)
+      set(PASTIX_LOOK_FOR_MPI OFF)
+    endif()
+    if (${component} STREQUAL "MPI")
+      # means we look for the MPI version of PaStiX (default)
+      set(PASTIX_LOOK_FOR_SEQ OFF)
+      set(PASTIX_LOOK_FOR_MPI ON)
+    endif()
+    if (${component} STREQUAL "STARPU")
+      # means we look for PaStiX with StarPU
+      set(PASTIX_LOOK_FOR_STARPU ON)
+    endif()
+    if (${component} STREQUAL "STARPU_CUDA")
+      # means we look for PaStiX with StarPU + CUDA
+      set(PASTIX_LOOK_FOR_STARPU ON)
+      set(PASTIX_LOOK_FOR_STARPU_CUDA ON)
+    endif()
+    if (${component} STREQUAL "STARPU_FXT")
+      # means we look for PaStiX with StarPU + FxT
+      set(PASTIX_LOOK_FOR_STARPU_FXT ON)
+    endif()
+    if (${component} STREQUAL "SCOTCH")
+      set(PASTIX_LOOK_FOR_SCOTCH ON)
+    endif()
+    if (${component} STREQUAL "PTSCOTCH")
+      set(PASTIX_LOOK_FOR_PTSCOTCH ON)
+    endif()
+    if (${component} STREQUAL "METIS")
+      set(PASTIX_LOOK_FOR_METIS ON)
+    endif()
+  endforeach()
+endif()
+
+# Dependencies detection
+# ----------------------
+
+
+# Required dependencies
+# ---------------------
+
+if (NOT PASTIX_FIND_QUIETLY)
+  message(STATUS "Looking for PASTIX - Try to detect pthread")
+endif()
+if (PASTIX_FIND_REQUIRED)
+  find_package(Threads REQUIRED QUIET)
+else()
+  find_package(Threads QUIET)
+endif()
+set(PASTIX_EXTRA_LIBRARIES "")
+if( THREADS_FOUND )
+  list(APPEND PASTIX_EXTRA_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+endif ()
+
+# Add math library to the list of extra
+# it normally exists on all common systems provided with a C compiler
+if (NOT PASTIX_FIND_QUIETLY)
+  message(STATUS "Looking for PASTIX - Try to detect libm")
+endif()
+set(PASTIX_M_LIBRARIES "")
+if(UNIX OR WIN32)
+  find_library(
+    PASTIX_M_m_LIBRARY
+    NAMES m
+    )
+  mark_as_advanced(PASTIX_M_m_LIBRARY)
+  if (PASTIX_M_m_LIBRARY)
+    list(APPEND PASTIX_M_LIBRARIES "${PASTIX_M_m_LIBRARY}")
+    list(APPEND PASTIX_EXTRA_LIBRARIES "${PASTIX_M_m_LIBRARY}")
+  else()
+    if (PASTIX_FIND_REQUIRED)
+      message(FATAL_ERROR "Could NOT find libm on your system."
+        " Are you sure you have a C compiler installed?")
+    endif()
+  endif()
+endif()
+ "Are you sure to a have a C compiler installed?") + endif() + endif() +endif() + +# Try to find librt (libposix4 - POSIX.1b Realtime Extensions library) +# on Unix systems except Apple ones because it does not exist on it +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect librt") +endif() +set(PASTIX_RT_LIBRARIES "") +if(UNIX AND NOT APPLE) + find_library( + PASTIX_RT_rt_LIBRARY + NAMES rt + ) + mark_as_advanced(PASTIX_RT_rt_LIBRARY) + if (PASTIX_RT_rt_LIBRARY) + list(APPEND PASTIX_RT_LIBRARIES "${PASTIX_RT_rt_LIBRARY}") + list(APPEND PASTIX_EXTRA_LIBRARIES "${PASTIX_RT_rt_LIBRARY}") + else() + if (PASTIX_FIND_REQUIRED) + message(FATAL_ERROR "Could NOT find librt on your system") + endif() + endif() +endif() + +# PASTIX depends on HWLOC +#------------------------ +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect HWLOC") +endif() +if (PASTIX_FIND_REQUIRED) + find_package(HWLOC REQUIRED QUIET) +else() + find_package(HWLOC QUIET) +endif() + +# PASTIX depends on BLAS +#----------------------- +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect BLAS") +endif() +if (PASTIX_FIND_REQUIRED) + find_package(BLASEXT REQUIRED QUIET) +else() + find_package(BLASEXT QUIET) +endif() + +# Optional dependencies +# --------------------- + +# PASTIX may depend on MPI +#------------------------- +if (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect MPI") + endif() + # allows to use an external mpi compilation by setting compilers with + # -DMPI_C_COMPILER=path/to/mpicc -DMPI_Fortran_COMPILER=path/to/mpif90 + # at cmake configure + if(NOT MPI_C_COMPILER) + set(MPI_C_COMPILER mpicc) + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_MPI) + find_package(MPI REQUIRED QUIET) + else() + find_package(MPI QUIET) + endif() + if (MPI_FOUND) + mark_as_advanced(MPI_LIBRARY) + mark_as_advanced(MPI_EXTRA_LIBRARY) + endif() +endif (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI) + +# PASTIX may depend on STARPU +#---------------------------- +if( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU) + + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect StarPU") + endif() + + set(PASTIX_STARPU_VERSION "1.1" CACHE STRING "oldest STARPU version desired") + + # create list of components in order to make a single call to find_package(starpu...) 
+  # we explicitly need a StarPU version built with hwloc
+  set(STARPU_COMPONENT_LIST "HWLOC")
+
+  # StarPU may depend on MPI
+  # allows using an external mpi compilation by setting compilers with
+  # -DMPI_C_COMPILER=path/to/mpicc -DMPI_Fortran_COMPILER=path/to/mpif90
+  # at cmake configure
+  if (PASTIX_LOOK_FOR_MPI)
+    if(NOT MPI_C_COMPILER)
+      set(MPI_C_COMPILER mpicc)
+    endif()
+    list(APPEND STARPU_COMPONENT_LIST "MPI")
+  endif()
+  if (PASTIX_LOOK_FOR_STARPU_CUDA)
+    list(APPEND STARPU_COMPONENT_LIST "CUDA")
+  endif()
+  if (PASTIX_LOOK_FOR_STARPU_FXT)
+    list(APPEND STARPU_COMPONENT_LIST "FXT")
+  endif()
+  # set the list of optional dependencies we may discover
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_STARPU)
+    find_package(STARPU ${PASTIX_STARPU_VERSION} REQUIRED
+      COMPONENTS ${STARPU_COMPONENT_LIST})
+  else()
+    find_package(STARPU ${PASTIX_STARPU_VERSION}
+      COMPONENTS ${STARPU_COMPONENT_LIST})
+  endif()
+
+endif( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU)
+
+# PASTIX may depend on SCOTCH
+#-----------------------------
+if (NOT SCOTCH_FOUND AND PASTIX_LOOK_FOR_SCOTCH)
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for PASTIX - Try to detect SCOTCH")
+  endif()
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_SCOTCH)
+    find_package(SCOTCH REQUIRED QUIET)
+  else()
+    find_package(SCOTCH QUIET)
+  endif()
+endif()
+
+# PASTIX may depend on PTSCOTCH
+#-------------------------------
+if (NOT PTSCOTCH_FOUND AND PASTIX_LOOK_FOR_PTSCOTCH)
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for PASTIX - Try to detect PTSCOTCH")
+  endif()
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_PTSCOTCH)
+    find_package(PTSCOTCH REQUIRED QUIET)
+  else()
+    find_package(PTSCOTCH QUIET)
+  endif()
+endif()
+
+# PASTIX may depend on METIS
+#----------------------------
+if (NOT METIS_FOUND AND PASTIX_LOOK_FOR_METIS)
+  if (NOT PASTIX_FIND_QUIETLY)
+    message(STATUS "Looking for PASTIX - Try to detect METIS")
+  endif()
+  if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_METIS)
+    find_package(METIS REQUIRED QUIET)
+  else()
+    find_package(METIS QUIET)
+  endif()
+endif()
+
+# Error if pastix required and no partitioning lib found
+if (PASTIX_FIND_REQUIRED AND NOT SCOTCH_FOUND AND NOT PTSCOTCH_FOUND AND NOT METIS_FOUND)
+  message(FATAL_ERROR "Could NOT find any partitioning library on your system"
+    " (install scotch, ptscotch or metis)")
+endif()
+
+
+# Looking for PaStiX
+# ------------------
+
+# Looking for include
+# -------------------
+
+# Add system include paths to search include
+# ------------------------------------------
+unset(_inc_env)
+set(ENV_PASTIX_DIR "$ENV{PASTIX_DIR}")
+set(ENV_PASTIX_INCDIR "$ENV{PASTIX_INCDIR}")
+if(ENV_PASTIX_INCDIR)
+  list(APPEND _inc_env "${ENV_PASTIX_INCDIR}")
+elseif(ENV_PASTIX_DIR)
+  list(APPEND _inc_env "${ENV_PASTIX_DIR}")
+  list(APPEND _inc_env "${ENV_PASTIX_DIR}/include")
+  list(APPEND _inc_env "${ENV_PASTIX_DIR}/include/pastix")
+else()
+  if(WIN32)
+    string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
+  else()
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+    list(APPEND _inc_env "${_path_env}")
+    string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+    list(APPEND _inc_env "${_path_env}")
+  endif()
+endif()
+list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
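+# drop duplicate entries gathered from the environment and the compiler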
+list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the pastix header in the given paths +# --------------------------------------------------- +# call cmake macro to find the header path +if(PASTIX_INCDIR) + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES pastix.h + HINTS ${PASTIX_INCDIR}) +else() + if(PASTIX_DIR) + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES pastix.h + HINTS ${PASTIX_DIR} + PATH_SUFFIXES "include" "include/pastix") + else() + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES pastix.h + HINTS ${_inc_env} + PATH_SUFFIXES "pastix") + endif() +endif() +mark_as_advanced(PASTIX_pastix.h_DIRS) + +# If found, add path to cmake variable +# ------------------------------------ +if (PASTIX_pastix.h_DIRS) + set(PASTIX_INCLUDE_DIRS "${PASTIX_pastix.h_DIRS}") +else () + set(PASTIX_INCLUDE_DIRS "PASTIX_INCLUDE_DIRS-NOTFOUND") + if(NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for pastix -- pastix.h not found") + endif() +endif() + + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_PASTIX_LIBDIR "$ENV{PASTIX_LIBDIR}") +if(ENV_PASTIX_LIBDIR) + list(APPEND _lib_env "${ENV_PASTIX_LIBDIR}") +elseif(ENV_PASTIX_DIR) + list(APPEND _lib_env "${ENV_PASTIX_DIR}") + list(APPEND _lib_env "${ENV_PASTIX_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the pastix lib in the given paths +# ------------------------------------------------ + +# create list of libs to find +set(PASTIX_libs_to_find "pastix_murge;pastix") + +# call cmake macro to find the lib path +if(PASTIX_LIBDIR) + foreach(pastix_lib ${PASTIX_libs_to_find}) + set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${PASTIX_LIBDIR}) + endforeach() +else() + if(PASTIX_DIR) + foreach(pastix_lib ${PASTIX_libs_to_find}) + set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${PASTIX_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(pastix_lib ${PASTIX_libs_to_find}) + set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +# If found, add path to cmake variable +# ------------------------------------ +foreach(pastix_lib ${PASTIX_libs_to_find}) + + get_filename_component(${pastix_lib}_lib_path ${PASTIX_${pastix_lib}_LIBRARY} PATH) + # set cmake variables (respects naming convention) + if (PASTIX_LIBRARIES) + list(APPEND PASTIX_LIBRARIES "${PASTIX_${pastix_lib}_LIBRARY}") + else() + set(PASTIX_LIBRARIES "${PASTIX_${pastix_lib}_LIBRARY}") + endif() + if (PASTIX_LIBRARY_DIRS) + list(APPEND PASTIX_LIBRARY_DIRS "${${pastix_lib}_lib_path}") + else() + set(PASTIX_LIBRARY_DIRS "${${pastix_lib}_lib_path}") + endif() + mark_as_advanced(PASTIX_${pastix_lib}_LIBRARY) + 
+endforeach(pastix_lib ${PASTIX_libs_to_find}) + +# check a function to validate the find +if(PASTIX_LIBRARIES) + + set(REQUIRED_LDFLAGS) + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # PASTIX + if (PASTIX_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${PASTIX_INCLUDE_DIRS}") + endif() + foreach(libdir ${PASTIX_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + set(REQUIRED_LIBS "${PASTIX_LIBRARIES}") + # STARPU + if (PASTIX_LOOK_FOR_STARPU AND STARPU_FOUND) + if (STARPU_INCLUDE_DIRS_DEP) + list(APPEND REQUIRED_INCDIRS "${STARPU_INCLUDE_DIRS_DEP}") + elseif (STARPU_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${STARPU_INCLUDE_DIRS}") + endif() + if(STARPU_LIBRARY_DIRS_DEP) + list(APPEND REQUIRED_LIBDIRS "${STARPU_LIBRARY_DIRS_DEP}") + elseif(STARPU_LIBRARY_DIRS) + list(APPEND REQUIRED_LIBDIRS "${STARPU_LIBRARY_DIRS}") + endif() + if (STARPU_LIBRARIES_DEP) + list(APPEND REQUIRED_LIBS "${STARPU_LIBRARIES_DEP}") + elseif (STARPU_LIBRARIES) + foreach(lib ${STARPU_LIBRARIES}) + if (EXISTS ${lib} OR ${lib} MATCHES "^-") + list(APPEND REQUIRED_LIBS "${lib}") + else() + list(APPEND REQUIRED_LIBS "-l${lib}") + endif() + endforeach() + endif() + endif() + # CUDA + if (PASTIX_LOOK_FOR_STARPU_CUDA AND CUDA_FOUND) + if (CUDA_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${CUDA_INCLUDE_DIRS}") + endif() + foreach(libdir ${CUDA_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES}") + endif() + # MPI + if (PASTIX_LOOK_FOR_MPI AND MPI_FOUND) + if (MPI_C_INCLUDE_PATH) + list(APPEND REQUIRED_INCDIRS "${MPI_C_INCLUDE_PATH}") + endif() + if (MPI_C_LINK_FLAGS) + if (${MPI_C_LINK_FLAGS} MATCHES " -") + string(REGEX REPLACE " -" "-" MPI_C_LINK_FLAGS ${MPI_C_LINK_FLAGS}) + endif() + list(APPEND REQUIRED_LDFLAGS "${MPI_C_LINK_FLAGS}") + endif() + list(APPEND REQUIRED_LIBS "${MPI_C_LIBRARIES}") + endif() + # HWLOC + if (HWLOC_FOUND) + if (HWLOC_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${HWLOC_INCLUDE_DIRS}") + endif() + foreach(libdir ${HWLOC_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + foreach(lib ${HWLOC_LIBRARIES}) + if (EXISTS ${lib} OR ${lib} MATCHES "^-") + list(APPEND REQUIRED_LIBS "${lib}") + else() + list(APPEND REQUIRED_LIBS "-l${lib}") + endif() + endforeach() + endif() + # BLAS + if (BLAS_FOUND) + if (BLAS_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${BLAS_INCLUDE_DIRS}") + endif() + foreach(libdir ${BLAS_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${BLAS_LIBRARIES}") + if (BLAS_LINKER_FLAGS) + list(APPEND REQUIRED_LDFLAGS "${BLAS_LINKER_FLAGS}") + endif() + endif() + # SCOTCH + if (PASTIX_LOOK_FOR_SCOTCH AND SCOTCH_FOUND) + if (SCOTCH_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${SCOTCH_INCLUDE_DIRS}") + endif() + foreach(libdir ${SCOTCH_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${SCOTCH_LIBRARIES}") + endif() + # PTSCOTCH + if (PASTIX_LOOK_FOR_PTSCOTCH AND PTSCOTCH_FOUND) + if (PTSCOTCH_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${PTSCOTCH_INCLUDE_DIRS}") + endif() + foreach(libdir ${PTSCOTCH_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${PTSCOTCH_LIBRARIES}") + endif() + # METIS + if (PASTIX_LOOK_FOR_METIS 
AND METIS_FOUND) + if (METIS_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}") + endif() + foreach(libdir ${METIS_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${METIS_LIBRARIES}") + endif() + # Fortran + if (CMAKE_C_COMPILER_ID MATCHES "GNU") + find_library( + FORTRAN_gfortran_LIBRARY + NAMES gfortran + HINTS ${_lib_env} + ) + mark_as_advanced(FORTRAN_gfortran_LIBRARY) + if (FORTRAN_gfortran_LIBRARY) + list(APPEND REQUIRED_LIBS "${FORTRAN_gfortran_LIBRARY}") + endif() + elseif (CMAKE_C_COMPILER_ID MATCHES "Intel") + find_library( + FORTRAN_ifcore_LIBRARY + NAMES ifcore + HINTS ${_lib_env} + ) + mark_as_advanced(FORTRAN_ifcore_LIBRARY) + if (FORTRAN_ifcore_LIBRARY) + list(APPEND REQUIRED_LIBS "${FORTRAN_ifcore_LIBRARY}") + endif() + endif() + # EXTRA LIBS such that pthread, m, rt + list(APPEND REQUIRED_LIBS ${PASTIX_EXTRA_LIBRARIES}) + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}") + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(PASTIX_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(pastix PASTIX_WORKS) + mark_as_advanced(PASTIX_WORKS) + + if(PASTIX_WORKS) + # save link with dependencies + set(PASTIX_LIBRARIES_DEP "${REQUIRED_LIBS}") + set(PASTIX_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}") + set(PASTIX_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}") + set(PASTIX_LINKER_FLAGS "${REQUIRED_LDFLAGS}") + list(REMOVE_DUPLICATES PASTIX_LIBRARY_DIRS_DEP) + list(REMOVE_DUPLICATES PASTIX_INCLUDE_DIRS_DEP) + list(REMOVE_DUPLICATES PASTIX_LINKER_FLAGS) + else() + if(NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX : test of pastix() fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + message(STATUS "Maybe PASTIX is linked with specific libraries. " + "Have you tried with COMPONENTS (MPI/SEQ, STARPU, STARPU_CUDA, SCOTCH, PTSCOTCH, METIS)? 
" + "See the explanation in FindPASTIX.cmake.") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif(PASTIX_LIBRARIES) + +if (PASTIX_LIBRARIES) + list(GET PASTIX_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(PASTIX_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of PASTIX library" FORCE) + else() + set(PASTIX_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of PASTIX library" FORCE) + endif() +endif() +mark_as_advanced(PASTIX_DIR) +mark_as_advanced(PASTIX_DIR_FOUND) + +# check that PASTIX has been found +# --------------------------------- include(FindPackageHandleStandardArgs) find_package_handle_standard_args(PASTIX DEFAULT_MSG - PASTIX_INCLUDES PASTIX_LIBRARIES) - -mark_as_advanced(PASTIX_INCLUDES PASTIX_LIBRARIES) + PASTIX_LIBRARIES + PASTIX_WORKS) diff --git a/cmake/FindScotch.cmake b/cmake/FindScotch.cmake index 530340b16..89d295ac2 100644 --- a/cmake/FindScotch.cmake +++ b/cmake/FindScotch.cmake @@ -1,24 +1,369 @@ -# Pastix requires SCOTCH or METIS (partitioning and reordering tools) +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find SCOTCH include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(SCOTCH +# [REQUIRED] # Fail with error if scotch is not found +# [COMPONENTS ...] # dependencies +# ) +# +# COMPONENTS can be some of the following: +# - ESMUMPS: to activate detection of Scotch with the esmumps interface +# +# This module finds headers and scotch library. +# Results are reported in variables: +# SCOTCH_FOUND - True if headers and requested libraries were found +# SCOTCH_INCLUDE_DIRS - scotch include directories +# SCOTCH_LIBRARY_DIRS - Link directories for scotch libraries +# SCOTCH_LIBRARIES - scotch component libraries to be linked +# SCOTCH_INTSIZE - Number of octets occupied by a SCOTCH_Num +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DSCOTCH=path/to/scotch): +# SCOTCH_DIR - Where to find the base directory of scotch +# SCOTCH_INCDIR - Where to find the header files +# SCOTCH_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: SCOTCH_DIR, SCOTCH_INCDIR, SCOTCH_LIBDIR -if (SCOTCH_INCLUDES AND SCOTCH_LIBRARIES) - set(SCOTCH_FIND_QUIETLY TRUE) -endif (SCOTCH_INCLUDES AND SCOTCH_LIBRARIES) +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. 
+#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) -find_path(SCOTCH_INCLUDES - NAMES - scotch.h - PATHS - $ENV{SCOTCHDIR} - ${INCLUDE_INSTALL_DIR} - PATH_SUFFIXES - scotch -) +if (NOT SCOTCH_FOUND) + set(SCOTCH_DIR "" CACHE PATH "Installation directory of SCOTCH library") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "A cache variable, namely SCOTCH_DIR, has been set to specify the install directory of SCOTCH") + endif() +endif() + +# Set the version to find +set(SCOTCH_LOOK_FOR_ESMUMPS OFF) + +if( SCOTCH_FIND_COMPONENTS ) + foreach( component ${SCOTCH_FIND_COMPONENTS} ) + if (${component} STREQUAL "ESMUMPS") + # means we look for esmumps library + set(SCOTCH_LOOK_FOR_ESMUMPS ON) + endif() + endforeach() +endif() + +# SCOTCH may depend on Threads, try to find it +if (NOT THREADS_FOUND) + if (SCOTCH_FIND_REQUIRED) + find_package(Threads REQUIRED) + else() + find_package(Threads) + endif() +endif() + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_SCOTCH_DIR "$ENV{SCOTCH_DIR}") +set(ENV_SCOTCH_INCDIR "$ENV{SCOTCH_INCDIR}") +if(ENV_SCOTCH_INCDIR) + list(APPEND _inc_env "${ENV_SCOTCH_INCDIR}") +elseif(ENV_SCOTCH_DIR) + list(APPEND _inc_env "${ENV_SCOTCH_DIR}") + list(APPEND _inc_env "${ENV_SCOTCH_DIR}/include") + list(APPEND _inc_env "${ENV_SCOTCH_DIR}/include/scotch") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) -find_library(SCOTCH_LIBRARIES scotch PATHS $ENV{SCOTCHDIR} ${LIB_INSTALL_DIR}) +# Try to find the scotch header in the given paths +# ------------------------------------------------- +# call cmake macro to find the header path +if(SCOTCH_INCDIR) + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${SCOTCH_INCDIR}) +else() + if(SCOTCH_DIR) + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${SCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") + else() + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") + endif() +endif() +mark_as_advanced(SCOTCH_scotch.h_DIRS) +# If found, add path to cmake variable +# ------------------------------------ +if (SCOTCH_scotch.h_DIRS) + set(SCOTCH_INCLUDE_DIRS "${SCOTCH_scotch.h_DIRS}") +else () + set(SCOTCH_INCLUDE_DIRS "SCOTCH_INCLUDE_DIRS-NOTFOUND") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "Looking for scotch -- scotch.h not found") + endif() +endif() +list(REMOVE_DUPLICATES SCOTCH_INCLUDE_DIRS) + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_SCOTCH_LIBDIR 
"$ENV{SCOTCH_LIBDIR}") +if(ENV_SCOTCH_LIBDIR) + list(APPEND _lib_env "${ENV_SCOTCH_LIBDIR}") +elseif(ENV_SCOTCH_DIR) + list(APPEND _lib_env "${ENV_SCOTCH_DIR}") + list(APPEND _lib_env "${ENV_SCOTCH_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the scotch lib in the given paths +# ---------------------------------------------- + +set(SCOTCH_libs_to_find "scotch;scotcherrexit") +if (SCOTCH_LOOK_FOR_ESMUMPS) + list(INSERT SCOTCH_libs_to_find 0 "esmumps") +endif() + +# call cmake macro to find the lib path +if(SCOTCH_LIBDIR) + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${SCOTCH_LIBDIR}) + endforeach() +else() + if(SCOTCH_DIR) + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${SCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +set(SCOTCH_LIBRARIES "") +set(SCOTCH_LIBRARY_DIRS "") +# If found, add path to cmake variable +# ------------------------------------ +foreach(scotch_lib ${SCOTCH_libs_to_find}) + + if (SCOTCH_${scotch_lib}_LIBRARY) + get_filename_component(${scotch_lib}_lib_path "${SCOTCH_${scotch_lib}_LIBRARY}" PATH) + # set cmake variables + list(APPEND SCOTCH_LIBRARIES "${SCOTCH_${scotch_lib}_LIBRARY}") + list(APPEND SCOTCH_LIBRARY_DIRS "${${scotch_lib}_lib_path}") + else () + list(APPEND SCOTCH_LIBRARIES "${SCOTCH_${scotch_lib}_LIBRARY}") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "Looking for scotch -- lib ${scotch_lib} not found") + endif() + endif () + + mark_as_advanced(SCOTCH_${scotch_lib}_LIBRARY) + +endforeach() +list(REMOVE_DUPLICATES SCOTCH_LIBRARY_DIRS) + +# check a function to validate the find +if(SCOTCH_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # SCOTCH + if (SCOTCH_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${SCOTCH_INCLUDE_DIRS}") + endif() + if (SCOTCH_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${SCOTCH_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${SCOTCH_LIBRARIES}") + # THREADS + if(CMAKE_THREAD_LIBS_INIT) + list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") + endif() + set(Z_LIBRARY "Z_LIBRARY-NOTFOUND") + find_library(Z_LIBRARY NAMES z) + mark_as_advanced(Z_LIBRARY) + if(Z_LIBRARY) + list(APPEND REQUIRED_LIBS "-lz") + endif() + set(M_LIBRARY "M_LIBRARY-NOTFOUND") + find_library(M_LIBRARY NAMES m) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") + endif() + set(RT_LIBRARY "RT_LIBRARY-NOTFOUND") + find_library(RT_LIBRARY NAMES rt) + mark_as_advanced(RT_LIBRARY) + if(RT_LIBRARY) + list(APPEND REQUIRED_LIBS "-lrt") + endif() + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + 
foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(SCOTCH_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(SCOTCH_graphInit SCOTCH_WORKS) + mark_as_advanced(SCOTCH_WORKS) + + if(SCOTCH_WORKS) + # save link with dependencies + set(SCOTCH_LIBRARIES "${REQUIRED_LIBS}") + else() + if(NOT SCOTCH_FIND_QUIETLY) + message(STATUS "Looking for SCOTCH : test of SCOTCH_graphInit with SCOTCH library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif(SCOTCH_LIBRARIES) + +if (SCOTCH_LIBRARIES) + list(GET SCOTCH_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(SCOTCH_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of SCOTCH library" FORCE) + else() + set(SCOTCH_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of SCOTCH library" FORCE) + endif() +endif() +mark_as_advanced(SCOTCH_DIR) +mark_as_advanced(SCOTCH_DIR_FOUND) + +# Check the size of SCOTCH_Num +# --------------------------------- +set(CMAKE_REQUIRED_INCLUDES ${SCOTCH_INCLUDE_DIRS}) + +include(CheckCSourceRuns) +#stdio.h and stdint.h should be included by scotch.h directly +set(SCOTCH_C_TEST_SCOTCH_Num_4 " +#include +#include +#include +int main(int argc, char **argv) { + if (sizeof(SCOTCH_Num) == 4) + return 0; + else + return 1; +} +") + +set(SCOTCH_C_TEST_SCOTCH_Num_8 " +#include +#include +#include +int main(int argc, char **argv) { + if (sizeof(SCOTCH_Num) == 8) + return 0; + else + return 1; +} +") +check_c_source_runs("${SCOTCH_C_TEST_SCOTCH_Num_4}" SCOTCH_Num_4) +if(NOT SCOTCH_Num_4) + check_c_source_runs("${SCOTCH_C_TEST_SCOTCH_Num_8}" SCOTCH_Num_8) + if(NOT SCOTCH_Num_8) + set(SCOTCH_INTSIZE -1) + else() + set(SCOTCH_INTSIZE 8) + endif() +else() + set(SCOTCH_INTSIZE 4) +endif() +set(CMAKE_REQUIRED_INCLUDES "") + +# check that SCOTCH has been found +# --------------------------------- include(FindPackageHandleStandardArgs) find_package_handle_standard_args(SCOTCH DEFAULT_MSG - SCOTCH_INCLUDES SCOTCH_LIBRARIES) - -mark_as_advanced(SCOTCH_INCLUDES SCOTCH_LIBRARIES) + SCOTCH_LIBRARIES + SCOTCH_WORKS) +# +# TODO: Add possibility to check for specific functions in the library +# diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d337594f5..8da51ce57 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -27,7 +27,7 @@ endif() if(NOT EIGEN_Fortran_COMPILER_WORKS) # search for a default Lapack library to complete Eigen's one - find_package(LAPACK) + find_package(LAPACK QUIET) endif() # configure blas/lapack (use Eigen's ones) @@ -80,23 +80,30 @@ else() endif() -find_package(Pastix) -find_package(Scotch) -find_package(Metis 5.0 REQUIRED) -if(PASTIX_FOUND) +find_package(PASTIX QUIET COMPONENTS METIS SCOTCH) +# check that the PASTIX found is a version without MPI +find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS + NAMES pastix_nompi.h + HINTS ${PASTIX_INCLUDE_DIRS} +) +if (NOT 
PASTIX_pastix_nompi.h_INCLUDE_DIRS) + message(STATUS "A version of Pastix has been found but pastix_nompi.h does not exist in the include directory." + " Because Eigen tests require a version without MPI, we disable the Pastix backend.") +endif() +if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS) add_definitions("-DEIGEN_PASTIX_SUPPORT") - include_directories(${PASTIX_INCLUDES}) + include_directories(${PASTIX_INCLUDE_DIRS_DEP}) if(SCOTCH_FOUND) - include_directories(${SCOTCH_INCLUDES}) + include_directories(${SCOTCH_INCLUDE_DIRS}) set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${SCOTCH_LIBRARIES}) elseif(METIS_FOUND) - include_directories(${METIS_INCLUDES}) + include_directories(${METIS_INCLUDE_DIRS}) set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${METIS_LIBRARIES}) else(SCOTCH_FOUND) ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ") endif(SCOTCH_FOUND) - set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) - set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES}) + set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP}) ei_add_property(EIGEN_TESTED_BACKENDS "PaStiX, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ") @@ -104,7 +111,7 @@ endif() if(METIS_FOUND) add_definitions("-DEIGEN_METIS_SUPPORT") - include_directories(${METIS_INCLUDES}) + include_directories(${METIS_INCLUDE_DIRS}) ei_add_property(EIGEN_TESTED_BACKENDS "METIS, ") else() ei_add_property(EIGEN_MISSING_BACKENDS "METIS, ") From f75dfdda7e6f133c4d45671efb6799dbb975b1ef Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 14 Apr 2017 10:34:30 +0200 Subject: [PATCH 094/680] Fix unwanted Real to Scalar to Real conversions in column-pivoting QR. --- Eigen/src/QR/ColPivHouseholderQR.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index d35395d04..1b828f5dc 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -505,8 +505,8 @@ void ColPivHouseholderQR::computeInPlace() m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k); } - RealScalar threshold_helper = numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); - RealScalar norm_downdate_threshold = numext::sqrt(NumTraits::epsilon()); + RealScalar threshold_helper = numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); + RealScalar norm_downdate_threshold = numext::sqrt(NumTraits::epsilon()); m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case) m_maxpivot = RealScalar(0); @@ -556,8 +556,8 @@ void ColPivHouseholderQR::computeInPlace() RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j); temp = (RealScalar(1) + temp) * (RealScalar(1) - temp); temp = temp < 0 ? 0 : temp; - RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / - m_colNormsDirect.coeffRef(j)); + RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / + m_colNormsDirect.coeffRef(j)); if (temp2 <= norm_downdate_threshold) { // The updated norm has become too inaccurate so re-compute the column // norm directly. 
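Note on the fix above: for a complex matrix, Scalar = std::complex<T> while
RealScalar = T, and NumTraits<Scalar>::epsilon() is already real. Spelling the
template arguments out as RealScalar keeps both pivot thresholds real-valued
instead of converting Real -> Scalar -> Real through complex temporaries. A
minimal standalone sketch of the intended typing, using only the NumTraits and
numext calls that appear in the patch; the literal values 2 and 4 are
illustrative stand-ins for the maxCoeff()*epsilon and rows terms:

    #include <complex>
    #include <Eigen/Core>

    int main() {
      typedef std::complex<double> Scalar;
      typedef Eigen::NumTraits<Scalar>::Real RealScalar; // double
      // Every intermediate below is real; no complex temporary is created.
      RealScalar threshold_helper =
          Eigen::numext::abs2(RealScalar(2) * Eigen::NumTraits<RealScalar>::epsilon()) / RealScalar(4);
      RealScalar norm_downdate_threshold =
          Eigen::numext::sqrt(Eigen::NumTraits<RealScalar>::epsilon());
      return (threshold_helper > RealScalar(0) && norm_downdate_threshold > RealScalar(0)) ? 0 : 1;
    }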
From d9084ac8e142697f0d767092a17ffc3a7a18a2e4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 14 Apr 2017 11:05:13 +0200 Subject: [PATCH 095/680] Improve mixing of complex and real in the vectorized path of apply_rotation_in_the_plane --- Eigen/src/Jacobi/Jacobi.h | 42 +++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index d25af8e90..c30326e1d 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -302,8 +302,12 @@ template void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; - enum { PacketSize = packet_traits::size }; + enum { + PacketSize = packet_traits::size, + OtherPacketSize = packet_traits::size + }; typedef typename packet_traits::type Packet; + typedef typename packet_traits::type OtherPacket; eigen_assert(xpr_x.size() == xpr_y.size()); Index size = xpr_x.size(); Index incrx = xpr_x.derived().innerStride(); @@ -321,6 +325,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x if(VectorX::SizeAtCompileTime == Dynamic && (VectorX::Flags & VectorY::Flags & PacketAccessBit) && + (PacketSize == OtherPacketSize) && ((incrx==1 && incry==1) || PacketSize == 1)) { // both vectors are sequentially stored in memory => vectorization @@ -329,9 +334,10 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x Index alignedStart = internal::first_default_aligned(y, size); Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; for(Index i=0; i& xpr_x { Packet xi = pload(px); Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); px += PacketSize; py += PacketSize; } @@ -365,10 +371,10 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x Packet xi1 = ploadu(px+PacketSize); Packet yi = pload (py); Packet yi1 = pload (py+PacketSize); - pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1))); - pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1))); + pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); + pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); px += Peeling*PacketSize; py += Peeling*PacketSize; } @@ -376,8 +382,8 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x { Packet xi = ploadu(x+peelingEnd); Packet yi = pload (y+peelingEnd); - pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi))); + pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); } } @@ -393,19 +399,21 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x /*** fixed-size vectorized path ***/ else if(VectorX::SizeAtCompileTime != Dynamic && (VectorX::Flags & VectorY::Flags & PacketAccessBit) && + (PacketSize == 
OtherPacketSize) && (EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment)>0)) // FIXME should be compared to the required alignment { - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; Scalar* EIGEN_RESTRICT px = x; Scalar* EIGEN_RESTRICT py = y; for(Index i=0; i(px); Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); px += PacketSize; py += PacketSize; } From 949a2da38cbfebe358a25dc59b47abb67beb4126 Mon Sep 17 00:00:00 2001 From: RJ Ryan Date: Fri, 14 Apr 2017 13:23:35 -0700 Subject: [PATCH 096/680] Use scalar_sum_op and scalar_quotient_op instead of operator+ and operator/ in MeanReducer. Improves support for std::complex types when compiling for CUDA. Expands on e2e9cdd16970914cf0a892fea5e7c4402b3ede41 and 2bda1b0d93fb627d0c500ec48b20302d44c32cb7 . --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 8 +++-- unsupported/test/cxx11_tensor_complex_cuda.cu | 36 +++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3b4f8eda1..5dcc3794c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -166,7 +166,8 @@ template struct MeanReducer return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum / scalarCount_; + internal::scalar_quotient_op quotient_op; + return quotient_op(accum, T(scalarCount_)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { @@ -175,7 +176,10 @@ template struct MeanReducer template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { internal::scalar_sum_op sum_op; - return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size); + internal::scalar_quotient_op quotient_op; + return quotient_op( + sum_op(saccum, predux(vaccum)), + T(scalarCount_ + packetCount_ * unpacket_traits::size)); } protected: diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu index d4e111f5d..87cf28920 100644 --- a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cuda.cu @@ -107,6 +107,41 @@ static void test_cuda_sum_reductions() { gpu_device.deallocate(gpu_out_ptr); } +static void test_cuda_mean_reductions() { + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + const int num_rows = internal::random(1024, 5*1024); + const int num_cols = internal::random(1024, 5*1024); + + Tensor, 2> in(num_rows, num_cols); + in.setRandom(); + + Tensor, 0> full_redux; + full_redux = in.mean(); + + std::size_t in_bytes = in.size() * sizeof(std::complex); + std::size_t out_bytes = full_redux.size() * sizeof(std::complex); + std::complex* gpu_in_ptr = static_cast*>(gpu_device.allocate(in_bytes)); + std::complex* gpu_out_ptr = static_cast*>(gpu_device.allocate(out_bytes)); + gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); + + TensorMap, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap, 0> > out_gpu(gpu_out_ptr); + + 
out_gpu.device(gpu_device) = in_gpu.mean(); + + Tensor, 0> full_redux_gpu; + gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); + gpu_device.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); +} static void test_cuda_product_reductions() { @@ -149,5 +184,6 @@ void test_cxx11_tensor_complex() { CALL_SUBTEST(test_cuda_nullary()); CALL_SUBTEST(test_cuda_sum_reductions()); + CALL_SUBTEST(test_cuda_mean_reductions()); CALL_SUBTEST(test_cuda_product_reductions()); } From 891ac03483dd282569581076d4e0819608fc1155 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 25 Apr 2017 13:58:10 +0200 Subject: [PATCH 097/680] Fix dense * sparse-selfadjoint-view product. --- Eigen/src/SparseCore/SparseSelfAdjointView.h | 3 ++- test/sparse_product.cpp | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 9e39be738..5ab64f1a8 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -47,6 +47,7 @@ template class SparseSelfAdjointView enum { Mode = _Mode, + TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0), RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime }; @@ -368,7 +369,7 @@ struct generic_product_impl dstT(dst); - internal::sparse_selfadjoint_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); + internal::sparse_selfadjoint_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); } }; diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp index c1edd26e3..197586741 100644 --- a/test/sparse_product.cpp +++ b/test/sparse_product.cpp @@ -297,6 +297,10 @@ template void sparse_product() VERIFY_IS_APPROX(x=mLo.template selfadjointView()*b, refX=refS*b); VERIFY_IS_APPROX(x=mS.template selfadjointView()*b, refX=refS*b); + VERIFY_IS_APPROX(x=b * mUp.template selfadjointView(), refX=b*refS); + VERIFY_IS_APPROX(x=b * mLo.template selfadjointView(), refX=b*refS); + VERIFY_IS_APPROX(x=b * mS.template selfadjointView(), refX=b*refS); + VERIFY_IS_APPROX(x.noalias()+=mUp.template selfadjointView()*b, refX+=refS*b); VERIFY_IS_APPROX(x.noalias()-=mLo.template selfadjointView()*b, refX-=refS*b); VERIFY_IS_APPROX(x.noalias()+=mS.template selfadjointView()*b, refX+=refS*b); From 9bc0a35731d914a54a6fa7215d2ba0aeb53194a9 Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Thu, 27 Apr 2017 03:09:03 -0400 Subject: [PATCH 098/680] Fixed nested angle barckets >> issue when compiling with cuda 8 --- Eigen/src/Core/arch/CUDA/Complex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index 9c2536509..ca0aaed32 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -55,7 +55,7 @@ template struct scalar_difference_op, std::complex struct scalar_product_op, const std::complex > : binary_op_base, const std::complex > { enum { - Vectorizable = packet_traits>::HasMul + Vectorizable = packet_traits >::HasMul }; typedef typename std::complex result_type; @@ -76,7 +76,7 @@ template struct scalar_product_op, std::complex > // Quotient template struct scalar_quotient_op, const std::complex > : binary_op_base, const 
std::complex > { enum { - Vectorizable = packet_traits>::HasDiv + Vectorizable = packet_traits >::HasDiv }; typedef typename std::complex result_type; From 4343db84d8df4fbca74ff4321d0b84c1169f0628 Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Mon, 1 May 2017 10:36:27 -0400 Subject: [PATCH 099/680] updated warning number for nvcc relase 8 (V8.0.61) for the stupid warning message 'calling a __host__ function from a __host__ __device__ function is not allowed'. --- Eigen/src/Core/util/DisableStupidWarnings.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 4431f2fc4..b91d1d1af 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -72,6 +72,7 @@ #pragma diag_suppress 2671 #pragma diag_suppress 2735 #pragma diag_suppress 2737 + #pragma diag_suppress 2739 #endif #endif // not EIGEN_WARNINGS_DISABLED From 052426b824038bbee1c1c48c3df65dfccd79ae24 Mon Sep 17 00:00:00 2001 From: a-doumoulakis Date: Fri, 5 May 2017 19:26:27 +0100 Subject: [PATCH 100/680] Add support for triSYCL Eigen is now able to use triSYCL with EIGEN_SYCL_TRISYCL and TRISYCL_INCLUDE_DIR options Fix contraction kernel with correct nd_item dimension --- CMakeLists.txt | 15 ++++- cmake/EigenTesting.cmake | 37 ++++++++---- unsupported/Eigen/CXX11/Tensor | 2 +- .../CXX11/src/Tensor/TensorContractionSycl.h | 2 +- unsupported/test/CMakeLists.txt | 59 +++++++++++-------- 5 files changed, 72 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fe4227cbb..54f3a7509 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,10 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-Wenum-conversion") ei_add_cxx_compiler_flag("-Wc++11-extensions") ei_add_cxx_compiler_flag("-Wdouble-promotion") + ei_add_cxx_compiler_flag("-Wno-unused-parameter") + ei_add_cxx_compiler_flag("-Wno-ignored-attributes") + ei_add_cxx_compiler_flag("-Wno-ignored-qualifiers") + ei_add_cxx_compiler_flag("-Wno-sign-compare") # ei_add_cxx_compiler_flag("-Wconversion") # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6 @@ -437,10 +441,17 @@ endif() # add SYCL option(EIGEN_TEST_SYCL "Add Sycl support." OFF) +option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." 
OFF) if(EIGEN_TEST_SYCL) set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}") - include(FindComputeCpp) -endif() + if(EIGEN_SYCL_TRISYCL) + message(STATUS "Using triSYCL") + include(FindTriSYCL) + else(EIGEN_SYCL_TRISYCL) + message(STATUS "Using ComputeCPP SYCL") + include(FindComputeCpp) + endif(EIGEN_SYCL_TRISYCL) +endif(EIGEN_TEST_SYCL) add_subdirectory(unsupported) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index a92a2978b..848233342 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -111,7 +111,7 @@ endmacro(ei_add_test_internal) # SYCL macro(ei_add_test_internal_sycl testname testname_with_suffix) - include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include) + set(targetname ${testname_with_suffix}) if(EIGEN_ADD_TEST_FILENAME_EXTENSION) @@ -124,18 +124,25 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix) set( bc_file ${CMAKE_CURRENT_BINARY_DIR}/${filename}) set( host_file ${CMAKE_CURRENT_SOURCE_DIR}/${filename}) - ADD_CUSTOM_COMMAND( - OUTPUT ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}.sycl\\\"" >> ${include_file} - DEPENDS ${filename} ${bc_file}.sycl - COMMENT "Building ComputeCpp integration header file ${include_file}" - ) - # Add a custom target for the generated integration header - add_custom_target(${testname}_integration_header_sycl DEPENDS ${include_file}) + if(NOT EIGEN_SYCL_TRISYCL) + include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include) + + ADD_CUSTOM_COMMAND( + OUTPUT ${include_file} + COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file} + COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}.sycl\\\"" >> ${include_file} + DEPENDS ${filename} ${bc_file}.sycl + COMMENT "Building ComputeCpp integration header file ${include_file}" + ) + + # Add a custom target for the generated integration header + add_custom_target(${testname}_integration_header_sycl DEPENDS ${include_file}) + add_executable(${targetname} ${include_file}) + add_dependencies(${targetname} ${testname}_integration_header_sycl) + else() + add_executable(${targetname} ${host_file}) + endif() - add_executable(${targetname} ${include_file}) - add_dependencies(${targetname} ${testname}_integration_header_sycl) add_sycl_to_target(${targetname} ${filename} ${CMAKE_CURRENT_BINARY_DIR}) if (targetname MATCHES "^eigen2_") @@ -467,7 +474,11 @@ macro(ei_testing_print_summary) endif() if(EIGEN_TEST_SYCL) - message(STATUS "SYCL: ON") + if(EIGEN_SYCL_TRISYCL) + message(STATUS "SYCL: ON (using triSYCL)") + else() + message(STATUS "SYCL: ON (using computeCPP)") + endif() else() message(STATUS "SYCL: OFF") endif() diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 39916092b..5d71a9c25 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -19,7 +19,7 @@ #undef isnan #undef isinf #undef isfinite -#include +#include #include #include #include diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index 5b4c3c5bd..e6840bc87 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -195,7 +195,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS m_j_strides(m_j_strides_), 
m_right_nocontract_strides(m_right_nocontract_strides_), left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} - void operator()(cl::sycl::nd_item<1> itemID) { + void operator()(cl::sycl::nd_item<2> itemID) { typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type LHSDevExpr; typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type RHSDevExpr; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index cdf151f15..9a9124640 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -152,33 +152,40 @@ endif() if(EIGEN_TEST_CXX11) if(EIGEN_TEST_SYCL) - ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_morphing_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_shuffling_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_layout_swap_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_inflation_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_generator_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_image_patch_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_volume_patcP_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_argmax_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") + if(EIGEN_SYCL_TRISYCL) + set(CMAKE_CXX_STANDARD 14) + set(STD_CXX_FLAG "-std=c++1z") + else(EIGEN_SYCL_TRISYCL) + # It should be safe to always run these tests as there is some fallback code for + # older compiler that don't support cxx11. 
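+    # (ComputeCpp path: C++11 is sufficient here; the triSYCL branch above
+    # requires C++14/C++1z.)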
+ set(CMAKE_CXX_STANDARD 11) + set(STD_CXX_FLAG "-std=c++11") + endif(EIGEN_SYCL_TRISYCL) + + ei_add_test_sycl(cxx11_tensor_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) + #ei_add_test_sycl(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) + #ei_add_test_sycl(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) + ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) endif(EIGEN_TEST_SYCL) - # It should be safe to always run these tests as there is some fallback code for - # older compiler that don't support cxx11. - set(CMAKE_CXX_STANDARD 11) ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}") From a5226ce4f77f68a7f8590174ed3cdebb89cbe127 Mon Sep 17 00:00:00 2001 From: a-doumoulakis Date: Wed, 17 May 2017 17:59:30 +0100 Subject: [PATCH 101/680] Add cmake file FindTriSYCL.cmake --- cmake/FindTriSYCL.cmake | 152 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 cmake/FindTriSYCL.cmake diff --git a/cmake/FindTriSYCL.cmake b/cmake/FindTriSYCL.cmake new file mode 100644 index 000000000..cb2154192 --- /dev/null +++ b/cmake/FindTriSYCL.cmake @@ -0,0 +1,152 @@ +#.rst: +# FindTriSYCL +#--------------- +# +# TODO : insert Copyright and licence + +######################### +# FindTriSYCL.cmake +######################### +# +# Tools for finding and building with TriSYCL. +# +# User must define TRISYCL_INCLUDE_DIR pointing to the triSYCL +# include directory. +# +# Latest version of this file can be found at: +# https://github.com/triSYCL/triSYCL + +# Requite CMake version 3.5 or higher +cmake_minimum_required (VERSION 3.5) + +# Check that a supported host compiler can be found +if(CMAKE_COMPILER_IS_GNUCXX) + # Require at least gcc 5.4 + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4) + message(FATAL_ERROR + "host compiler - Not found! (gcc version must be at least 5.4)") + else() + message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") + endif() +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + # Require at least clang 3.9 + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.9) + message(FATAL_ERROR + "host compiler - Not found! 
(clang version must be at least 3.9)") + else() + message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}") + endif() +else() + message(WARNING + "host compiler - Not found! (triSYCL supports GCC and Clang)") +endif() + +#triSYCL options +option(TRISYCL_OPENMP "triSYCL multi-threading with OpenMP" ON) +option(TRISYCL_OPENCL "triSYCL OpenCL interoperability mode" OFF) +option(TRISYCL_NO_ASYNC "triSYCL use synchronous kernel execution" OFF) +option(TRISYCL_DEBUG "triSCYL use debug mode" OFF) +option(TRISYCL_DEBUG_STRUCTORS "triSYCL trace of object lifetimes" OFF) +option(TRISYCL_TRACE_KERNEL "triSYCL trace of kernel execution" OFF) + +mark_as_advanced(TRISYCL_OPENMP) +mark_as_advanced(TRISYCL_OPENCL) +mark_as_advanced(TRISYCL_NO_ASYNC) +mark_as_advanced(TRISYCL_DEBUG) +mark_as_advanced(TRISYCL_DEBUG_STRUCTORS) +mark_as_advanced(TRISYCL_TRACE_KERNEL) + +#triSYCL definitions +set(CL_SYCL_LANGUAGE_VERSION 220 CACHE VERSION + "Host language version to be used by trisYCL (default is: 220)") +set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE VERSION + "Device language version to be used by trisYCL (default is: 220)") +#set(TRISYCL_COMPILE_OPTIONS "-std=c++1z -Wall -Wextra") +set(CMAKE_CXX_STANDARD 14) +set(CXX_STANDARD_REQUIRED ON) + + +# Find OpenCL package +if(TRISYCL_OPENCL) + find_package(OpenCL REQUIRED) + if(UNIX) + set(BOOST_COMPUTE_INCPATH /usr/include/compute CACHE PATH + "Path to Boost.Compute headers (default is: /usr/include/compute)") + endif(UNIX) +endif() + +# Find OpenMP package +if(TRISYCL_OPENMP) + find_package(OpenMP REQUIRED) +endif() + +# Find Boost +find_package(Boost 1.58 REQUIRED COMPONENTS chrono log) + +# If debug or trace we need boost log +if(TRISYCL_DEBUG OR TRISYCL_DEBUG_STRUCTORS OR TRISYCL_TRACE_KERNEL) + set(LOG_NEEDED ON) +else() + set(LOG_NEEDED OFF) +endif() + +find_package(Threads REQUIRED) + +# Find triSYCL directory +if(NOT TRISYCL_INCLUDE_DIR) + message(FATAL_ERROR + "triSYCL include directory - Not found! (please set TRISYCL_INCLUDE_DIR") +else() + message(STATUS "triSYCL include directory - Found ${TRISYCL_INCLUDE_DIR}") +endif() + +####################### +# add_sycl_to_target +####################### +# +# Sets the proper flags and includes for the target compilation. +# +# targetName : Name of the target to add a SYCL to. +# sourceFile : Source file to be compiled for SYCL. +# binaryDir : Intermediate directory to output the integration header. 
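+# Example usage (target and file names are illustrative only):
+#   add_executable(my_sycl_app my_sycl_app.cpp)
+#   add_sycl_to_target(my_sycl_app my_sycl_app.cpp ${CMAKE_CURRENT_BINARY_DIR})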
+# +function(add_sycl_to_target targetName sourceFile binaryDir) + + # Add include directories to the "#include <>" paths + target_include_directories (${targetName} PUBLIC + ${TRISYCL_INCLUDE_DIR} + ${Boost_INCLUDE_DIRS} + $<$:${OpenCL_INCLUDE_DIRS}> + $<$:${BOOST_COMPUTE_INCPATH}>) + + + # Link dependencies + target_link_libraries(${targetName} PUBLIC + $<$:${OpenCL_LIBRARIES}> + Threads::Threads + $<$:Boost::log> + Boost::chrono) + + + # Compile definitions + target_compile_definitions(${targetName} PUBLIC + $<$:TRISYCL_NO_ASYNC> + $<$:TRISYCL_OPENCL> + $<$:TRISYCL_DEBUG> + $<$:TRISYCL_DEBUG_STRUCTORS> + $<$:TRISYCL_TRACE_KERNEL> + $<$:BOOST_LOG_DYN_LINK>) + + # C++ and OpenMP requirements + target_compile_options(${targetName} PUBLIC + ${TRISYCL_COMPILE_OPTIONS} + $<$:${OpenMP_CXX_FLAGS}>) + + if(${TRISYCL_OPENMP} AND (NOT WIN32)) + # Does not support generator expressions + set_target_properties(${targetName} + PROPERTIES + LINK_FLAGS ${OpenMP_CXX_FLAGS}) + endif(${TRISYCL_OPENMP} AND (NOT WIN32)) + +endfunction(add_sycl_to_target) From 61d7f3664a06c64e39a0bbccb610592bcaee92cd Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 22 May 2017 14:58:28 +0100 Subject: [PATCH 102/680] Fixing Cmake Dependency for SYCL --- cmake/EigenTesting.cmake | 18 +++++++++--------- cmake/FindComputeCpp.cmake | 8 +++++--- unsupported/test/CMakeLists.txt | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index a92a2978b..1fcdcc7bb 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -120,23 +120,23 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix) set(filename ${testname}.cpp) endif() - set( include_file ${CMAKE_CURRENT_BINARY_DIR}/inc_${filename}) - set( bc_file ${CMAKE_CURRENT_BINARY_DIR}/${filename}) - set( host_file ${CMAKE_CURRENT_SOURCE_DIR}/${filename}) + set( include_file "${CMAKE_CURRENT_BINARY_DIR}/inc_${filename}") + set( bc_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.sycl") + set( host_file "${CMAKE_CURRENT_SOURCE_DIR}/${filename}") ADD_CUSTOM_COMMAND( OUTPUT ${include_file} COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}.sycl\\\"" >> ${include_file} - DEPENDS ${filename} ${bc_file}.sycl + COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}\\\"" >> ${include_file} + DEPENDS ${host_file} ${bc_file} COMMENT "Building ComputeCpp integration header file ${include_file}" ) # Add a custom target for the generated integration header - add_custom_target(${testname}_integration_header_sycl DEPENDS ${include_file}) + add_custom_target("${testname}_integration_header_sycl" DEPENDS ${include_file}) add_executable(${targetname} ${include_file}) - add_dependencies(${targetname} ${testname}_integration_header_sycl) - add_sycl_to_target(${targetname} ${filename} ${CMAKE_CURRENT_BINARY_DIR}) + add_dependencies(${targetname} "${testname}_integration_header_sycl") + add_sycl_to_target(${targetname} ${CMAKE_CURRENT_BINARY_DIR} ${filename}) if (targetname MATCHES "^eigen2_") add_dependencies(eigen2_buildtests ${targetname}) @@ -720,4 +720,4 @@ macro(ei_test_get_compilerver_from_cxx_version_string) ei_test1_get_compilerver_from_cxx_version_string("i686-apple-darwin11-llvm-g++-4.2 (GCC) 4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 2335.15.00)" "llvm-g++" "4.2.1") ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 4.4.6" "g++" "4.4.6") ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 2011" "g++" "4.4") -endmacro(ei_test_get_compilerver_from_cxx_version_string) +endmacro(ei_test_get_compilerver_from_cxx_version_string) \ No newline at end of file diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 27e5c9b1f..90222ed0e 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -201,7 +201,8 @@ function(__build_spir targetName sourceFile binaryDir) ${device_compiler_includes} -o ${outputSyclFile} -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} - DEPENDS ${sourceFile} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} + IMPLICIT_DEPENDS CXX "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}" WORKING_DIRECTORY ${binaryDir} COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") @@ -233,8 +234,9 @@ endfunction() # sourceFile : Source file to be compiled for SYCL. # binaryDir : Intermediate directory to output the integration header. # -function(add_sycl_to_target targetName sourceFile binaryDir) +function(add_sycl_to_target targetName binaryDir sourceFile) + set(sourceFiles ${sourceFiles} ${ARGN}) # Add custom target to run compute++ and generate the integration header __build_spir(${targetName} ${sourceFile} ${binaryDir}) @@ -242,4 +244,4 @@ function(add_sycl_to_target targetName sourceFile binaryDir) target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} PUBLIC ${OpenCL_LIBRARIES}) -endfunction(add_sycl_to_target) +endfunction(add_sycl_to_target) \ No newline at end of file diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index cdf151f15..196a432ff 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -172,7 +172,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_generator_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_image_patch_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_volume_patcP_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_volume_patch_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_argmax_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) From 2d17128d6f885ab839f2ecde360268f5b523426c Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 22 May 2017 16:40:33 +0100 Subject: [PATCH 103/680] Fixing suported device list. --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index c5142b7c9..06df1030b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -81,28 +81,26 @@ struct memsetCghFunctor{ } }; - //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) +//get_devices returns all the available opencl devices. 
Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU)
 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
-  auto devices = cl::sycl::device::get_devices();
-  std::vector::iterator it =devices.begin();
-  while(it!=devices.end()) {
-    ///FIXME: Currently there is a bug in amd cpu OpenCL
-    auto name = (*it).template get_info();
-    std::transform(name.begin(), name.end(), name.begin(), ::tolower);
-    auto vendor = (*it).template get_info();
+std::vector supported_devices;
+auto platform_list =cl::sycl::platform::get_platforms();
+for(const auto& platform : platform_list){
+  auto device_list = platform.get_devices();
+  auto platform_name =platform.template get_info();
+  std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower);
+  for(const auto& device : device_list){
+    auto vendor = device.template get_info();
     std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
-
-    if((*it).is_cpu() && vendor.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs
-      it = devices.erase(it);
-      //FIXME: currently there is a bug in intel gpu driver regarding memory alignment issue.
-    }else if((*it).is_gpu() && name.find("intel")!=std::string::npos){
-      it = devices.erase(it);
-    }
-    else{
-      ++it;
+    bool unsupported_condition = (device.is_cpu() && platform_name.find("amd")!=std::string::npos && vendor.find("apu") == std::string::npos) ||
+      (device.is_gpu() && platform_name.find("intel")!=std::string::npos);
+    if(!unsupported_condition){
+      std::cout << "Platform name "<< platform_name << std::endl;
+      supported_devices.push_back(device);
     }
   }
-  return devices;
+}
+return supported_devices;
 }

 class QueueInterface {

From 76c0fc1f955eda3d243db8960cb6fee9a5305112 Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Mon, 22 May 2017 16:49:32 +0100
Subject: [PATCH 104/680] Fixing SYCL alignment issue required by TensorFlow.
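
The allocator below calls aligned_alloc() only when the requested alignment is
a power of two no smaller than sizeof(void*): CheckAlignStatically evaluates
that condition at compile time, and Conditional_Allocate falls back to plain
malloc() when it does not hold. A sketch of the dispatch with illustrative
Align values on a 64-bit host (not part of the patch):

    // Align = 16: (16 & 15) == 0 and 16 >= sizeof(void*), so Val is true
    //   Conditional_Allocate<true, 16>::conditional_allocate(n) -> aligned_alloc(16, n)
    // Align = 4: 4 < sizeof(void*), so Val is false
    //   Conditional_Allocate<false, 4>::conditional_allocate(n) -> malloc(n)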
---
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index c5142b7c9..627c0ab19 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -14,7 +14,23 @@
 #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H

+template struct CheckAlignStatically{
+  static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *)));
+};
+template
+struct Conditional_Allocate{
+EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){
+  return aligned_alloc(Align, elements);
+}
+};
+template
+struct Conditional_Allocate{
+
+EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){
+  return malloc(elements);
+}
+};
 template >
 struct SyclAllocator {
   typedef Scalar value_type;
@@ -22,7 +38,10 @@ struct SyclAllocator {
   typedef typename std::allocator_traits::size_type size_type;

   SyclAllocator( ){};
-  Scalar* allocate(std::size_t elements) { return static_cast(aligned_alloc(Align, elements)); }
+  Scalar* allocate(std::size_t elements) {
+    return static_cast(Conditional_Allocate::Val, Align>::conditional_allocate(elements));
+    // return static_cast(aligned_alloc(Align, elements));
+  }
   void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); }
 };

@@ -533,4 +552,4 @@ struct SyclKernelDevice:DefaultDevice{};

 } // end namespace Eigen

-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
\ No newline at end of file

From b42d775f1350838234f9f05bf64a4f7c12282218 Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Tue, 23 May 2017 10:51:14 +0100
Subject: [PATCH 105/680] Temporary branch for sync with upstream

From 8508db52ab832bfd52ac314c862b7beeb64dc182 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 6 Jun 2017 17:25:56 +0200
Subject: [PATCH 106/680] bug #1417: make LinSpace compatible with std::complex

---
 Eigen/src/Core/functors/NullaryFunctors.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index 6a30466fb..b03be0269 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -44,16 +44,16 @@ struct linspaced_op_impl
 {
   linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
     m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1),
     m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
-    m_interPacket(plset(0)),
     m_flip(numext::abs(high)

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
+    typedef typename NumTraits::Real RealScalar;
     if(m_flip)
-      return (i==0)? m_low : (m_high - (m_size1-i)*m_step);
+      return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
     else
-      return (i==m_size1)? m_high : (m_low + i*m_step);
+      return (i==m_size1)?
m_high : (m_low + RealScalar(i)*m_step); } template @@ -63,7 +63,7 @@ struct linspaced_op_impl // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) if(m_flip) { - Packet pi = padd(pset1(Scalar(i-m_size1)),m_interPacket); + Packet pi = plset(Scalar(i-m_size1)); Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); if(i==0) res = pinsertfirst(res, m_low); @@ -71,7 +71,7 @@ struct linspaced_op_impl } else { - Packet pi = padd(pset1(Scalar(i)),m_interPacket); + Packet pi = plset(Scalar(i)); Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); if(i==m_size1-unpacket_traits::size+1) res = pinsertlast(res, m_high); @@ -83,7 +83,6 @@ struct linspaced_op_impl const Scalar m_high; const Index m_size1; const Scalar m_step; - const Packet m_interPacket; const bool m_flip; }; From e018142604b7f138d351dfc3c34890159cbe5fdc Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 6 Jun 2017 19:23:14 +0200 Subject: [PATCH 107/680] Make sure CholmodSupport works when included in multiple compilation units (issue was reported on stackoverflow.com) --- Eigen/src/CholmodSupport/CholmodSupport.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index 61faf43ba..dc199ece6 100644 --- a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h @@ -167,12 +167,12 @@ namespace internal { // template specializations for int and long that call the correct cholmod method #define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \ - template ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ - template<> ret cm_ ## name (cholmod_common &Common) { return cholmod_l_ ## name (&Common); } + template inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ + template<> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_l_ ## name (&Common); } #define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \ - template ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ - template<> ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); } + template inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ + template<> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); } EIGEN_CHOLMOD_SPECIALIZE0(int, start) EIGEN_CHOLMOD_SPECIALIZE0(int, finish) @@ -183,16 +183,16 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A) EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A) -template cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } -template<> cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); } +template inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } +template<> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); } -template cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } -template<> cholmod_sparse* 
cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); } +template inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } +template<> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); } template -int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); } +inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); } template<> -int cm_factorize_p (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); } +inline int cm_factorize_p (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); } #undef EIGEN_CHOLMOD_SPECIALIZE0 #undef EIGEN_CHOLMOD_SPECIALIZE1 From f2a553fb7b5dd2b8bf7b35af8fc9485f25b778b5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 7 Jun 2017 10:10:30 +0200 Subject: [PATCH 108/680] bug #1411: fix usage of alignment information in vectorization of quaternion product and conjugate. --- Eigen/src/Geometry/Quaternion.h | 10 ++--- Eigen/src/Geometry/arch/Geometry_SSE.h | 60 +++++++++++++++++--------- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index f6ef1bcf6..3e5a9badb 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -423,7 +423,7 @@ typedef Map, Aligned> QuaternionMapAlignedd; // Generic Quaternion * Quaternion product // This product can be specialized for a given architecture via the Arch template argument. 
namespace internal { -template struct quat_product +template struct quat_product { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& a, const QuaternionBase& b){ return Quaternion @@ -446,8 +446,7 @@ QuaternionBase::operator* (const QuaternionBase& other) c EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) return internal::quat_product::Scalar, - EIGEN_PLAIN_ENUM_MIN(internal::traits::Alignment, internal::traits::Alignment)>::run(*this, other); + typename internal::traits::Scalar>::run(*this, other); } /** \sa operator*(Quaternion) */ @@ -672,7 +671,7 @@ EIGEN_DEVICE_FUNC inline Quaternion::Scalar> // Generic conjugate of a Quaternion namespace internal { -template struct quat_conj +template struct quat_conj { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& q){ return Quaternion(q.w(),-q.x(),-q.y(),-q.z()); @@ -691,8 +690,7 @@ EIGEN_DEVICE_FUNC inline Quaternion::Scalar> QuaternionBase::conjugate() const { return internal::quat_conj::Scalar, - internal::traits::Alignment>::run(*this); + typename internal::traits::Scalar>::run(*this); } diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h index 1a86ff837..f68cab583 100644 --- a/Eigen/src/Geometry/arch/Geometry_SSE.h +++ b/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -16,17 +16,23 @@ namespace Eigen { namespace internal { template -struct quat_product +struct quat_product { + enum { + AAlignment = traits::Alignment, + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { Quaternion res; const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - __m128 a = _a.coeffs().template packet(0); - __m128 b = _b.coeffs().template packet(0); + __m128 a = _a.coeffs().template packet(0); + __m128 b = _b.coeffs().template packet(0); __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); - pstore(&res.x(), + pstoret( + &res.x(), _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), vec4f_swizzle1(b,1,2,0,0))), @@ -36,14 +42,17 @@ struct quat_product } }; -template -struct quat_conj +template +struct quat_conj { + enum { + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& q) { Quaternion res; const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet(0))); + pstoret(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet::Alignment>(0))); return res; } }; @@ -52,6 +61,9 @@ struct quat_conj template struct cross3_impl { + enum { + ResAlignment = traits::type>::Alignment + }; static inline typename plain_matrix_type::type run(const VectorLhs& lhs, const VectorRhs& rhs) { @@ -60,7 +72,7 @@ struct cross3_impl __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); typename plain_matrix_type::type res; - pstore(&res.x(),_mm_sub_ps(mul1,mul2)); + pstoret(&res.x(),_mm_sub_ps(mul1,mul2)); return res; } }; @@ -68,9 +80,14 @@ struct cross3_impl -template -struct quat_product +template +struct quat_product { + enum { + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const 
QuaternionBase& _a, const QuaternionBase& _b) { const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); @@ -78,8 +95,8 @@ struct quat_product Quaternion res; const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); + Packet2d b_xy = _b.coeffs().template packet(0); + Packet2d b_zw = _b.coeffs().template packet(2); Packet2d a_xx = pset1(a[0]); Packet2d a_yy = pset1(a[1]); Packet2d a_zz = pset1(a[2]); @@ -97,9 +114,9 @@ struct quat_product t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); #ifdef EIGEN_VECTORIZE_SSE3 EIGEN_UNUSED_VARIABLE(mask) - pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2))); + pstoret(&res.x(), _mm_addsub_pd(t1, preverse(t2))); #else - pstore(&res.x(), padd(t1, pxor(mask,preverse(t2)))); + pstoret(&res.x(), padd(t1, pxor(mask,preverse(t2)))); #endif /* @@ -111,25 +128,28 @@ struct quat_product t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); #ifdef EIGEN_VECTORIZE_SSE3 EIGEN_UNUSED_VARIABLE(mask) - pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); + pstoret(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); #else - pstore(&res.z(), psub(t1, pxor(mask,preverse(t2)))); + pstoret(&res.z(), psub(t1, pxor(mask,preverse(t2)))); #endif return res; } }; -template -struct quat_conj +template +struct quat_conj { + enum { + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& q) { Quaternion res; const __m128d mask0 = _mm_setr_pd(-0.,-0.); const __m128d mask2 = _mm_setr_pd(-0.,0.); - pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet(0))); - pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet(2))); + pstoret(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet::Alignment>(0))); + pstoret(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet::Alignment>(2))); return res; } }; From 26e8f9171eb07e188af45d7583b03c5be40a5f48 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 7 Jun 2017 10:51:23 +0200 Subject: [PATCH 109/680] Fix compilation of matrix log with Map as input --- .../Eigen/src/MatrixFunctions/MatrixFunction.h | 11 +++++------ .../Eigen/src/MatrixFunctions/MatrixLogarithm.h | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index db2449d02..3f7d77710 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -398,8 +398,8 @@ struct matrix_function_compute template struct matrix_function_compute { - template - static void run(const MatrixType& A, AtomicType& atomic, ResultType &result) + template + static void run(const MatA& A, AtomicType& atomic, ResultType &result) { typedef internal::traits Traits; typedef typename Traits::Scalar Scalar; @@ -422,11 +422,10 @@ struct matrix_function_compute template struct matrix_function_compute { - template - static void run(const MatrixType& A, AtomicType& atomic, ResultType &result) + template + static void run(const MatA& A, AtomicType& atomic, ResultType &result) { typedef internal::traits Traits; - typedef typename MatrixType::Index Index; // compute Schur decomposition of A const ComplexSchur schurOfA(A); @@ -514,7 +513,7 @@ template class MatrixFunctionReturnValue typedef internal::MatrixFunctionAtomic AtomicType; AtomicType atomic(m_f); - internal::matrix_function_compute::run(m_A, atomic, result); + internal::matrix_function_compute::run(m_A, atomic, 
result); } Index rows() const { return m_A.rows(); } diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index 1acfbed9e..ff8f6e732 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -339,7 +339,7 @@ public: typedef internal::MatrixLogarithmAtomic AtomicType; AtomicType atomic; - internal::matrix_function_compute::run(m_A, atomic, result); + internal::matrix_function_compute::run(m_A, atomic, result); } Index rows() const { return m_A.rows(); } From 2971503fed85add086cb163d24ee5d402d631aab Mon Sep 17 00:00:00 2001 From: Mmanu Chaturvedi Date: Tue, 23 May 2017 17:12:36 -0400 Subject: [PATCH 110/680] Specializing numeric_limits For AutoDiffScalar --- unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 7 +++++++ unsupported/test/autodiff_scalar.cpp | 14 ++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index d2808860c..279fe5cd3 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -683,4 +683,11 @@ template struct NumTraits > } +namespace std { +template +class numeric_limits > + : public numeric_limits {}; + +} // namespace std + #endif // EIGEN_AUTODIFF_SCALAR_H diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index 4df2f5c57..3a78d6b8d 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -72,6 +72,19 @@ template void check_hyperbolic_functions() VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150)); } +template +void check_limits_specialization() +{ + typedef Eigen::Matrix Deriv; + typedef Eigen::AutoDiffScalar AD; + + typedef std::numeric_limits A; + typedef std::numeric_limits B; + + bool res = std::is_base_of::value; + VERIFY_IS_EQUAL(res, true); +} + void test_autodiff_scalar() { for(int i = 0; i < g_repeat; i++) { @@ -79,5 +92,6 @@ void test_autodiff_scalar() CALL_SUBTEST_2( check_atan2() ); CALL_SUBTEST_3( check_hyperbolic_functions() ); CALL_SUBTEST_4( check_hyperbolic_functions() ); + CALL_SUBTEST_5( check_limits_specialization()); } } From 0cb3c7c7dd46d318921b5e140afc73fded87b068 Mon Sep 17 00:00:00 2001 From: Duncan McBain Date: Wed, 24 May 2017 12:24:21 +0100 Subject: [PATCH 111/680] Update FindComputeCpp.cmake with new changes from SDK --- cmake/FindComputeCpp.cmake | 49 ++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 90222ed0e..e61dedc46 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -38,11 +38,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) message(FATAL_ERROR "host compiler - Not found! 
(gcc version must be at least 4.8)") - # Require the GCC dual ABI to be disabled for 5.1 or higher - elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1) - set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True") - message(STATUS - "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)") else() message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") endif() @@ -64,6 +59,12 @@ option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode" ${COMPUTECPP_64_BIT_DEFAULT}) mark_as_advanced(COMPUTECPP_64_BIT_CODE) +option(COMPUTECPP_DISABLE_GCC_DUAL_ABI "Compile with pre-5.1 ABI" OFF) +mark_as_advanced(COMPUTECPP_DISABLE_GCC_DUAL_ABI) + +set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++") +mark_as_advanced(COMPUTECPP_USER_FLAGS) + # Find OpenCL package find_package(OpenCL REQUIRED) @@ -74,7 +75,6 @@ if(NOT COMPUTECPP_PACKAGE_ROOT_DIR) else() message(STATUS "ComputeCpp package - Found") endif() -option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package") # Obtain the path to compute++ find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS @@ -138,8 +138,6 @@ else() message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") endif() -set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -Wall -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) - # Check if the platform is supported execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED @@ -155,6 +153,13 @@ else() endif() endif() +set(COMPUTECPP_USER_FLAGS + -sycl-compress-name + -Wall + -no-serial-memop + -DEIGEN_NO_ASSERTION_CHECKING=1 + ) + #################### # __build_sycl #################### @@ -165,8 +170,11 @@ endif() # targetName : Name of the target. # sourceFile : Source file to be compiled. # binaryDir : Intermediate directory to output the integration header. +# fileCounter : Counter included in name of custom target. Different counter +# values prevent duplicated names of custom target when source files with the same name, +# but located in different directories, are used for the same target. # -function(__build_spir targetName sourceFile binaryDir) +function(__build_spir targetName sourceFile binaryDir fileCounter) # Retrieve source file name. 
get_filename_component(sourceFileName ${sourceFile} NAME) @@ -175,12 +183,16 @@ function(__build_spir targetName sourceFile binaryDir) set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl) # Add any user-defined include to the device compiler + set(device_compiler_includes "") get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) - set(device_compiler_includes "") foreach(directory ${includeDirectories}) set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) endforeach() + get_target_property(targetIncludeDirectories ${targetName} INCLUDE_DIRECTORIES) + foreach(directory ${targetIncludeDirectories}) + set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) + endforeach() if (CMAKE_INCLUDE_PATH) foreach(directory ${CMAKE_INCLUDE_PATH}) set(device_compiler_includes "-I${directory}" @@ -188,6 +200,9 @@ endforeach() endif() + set(COMPUTECPP_DEVICE_COMPILER_FLAGS + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + ${COMPUTECPP_USER_FLAGS}) # Convert argument list format separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS) @@ -204,7 +219,7 @@ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} IMPLICIT_DEPENDS CXX "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}" WORKING_DIRECTORY ${binaryDir} - COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") + COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") # Add a custom target for the generated integration header add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile}) @@ -231,17 +246,21 @@ endfunction() # target and sets a dependency on that new command. # # targetName : Name of the target to add a SYCL to. -# sourceFile : Source file to be compiled for SYCL. # binaryDir : Intermediate directory to output the integration header. +# sourceFiles : Source files to be compiled for SYCL. # -function(add_sycl_to_target targetName binaryDir sourceFile) +function(add_sycl_to_target targetName binaryDir sourceFiles) set(sourceFiles ${sourceFiles} ${ARGN}) + set(fileCounter 0) # Add custom target to run compute++ and generate the integration header - __build_spir(${targetName} ${sourceFile} ${binaryDir}) + foreach(sourceFile ${sourceFiles}) + __build_spir(${targetName} ${sourceFile} ${binaryDir} ${fileCounter}) + MATH(EXPR fileCounter "${fileCounter} + 1") + endforeach() # Link with the ComputeCpp runtime library target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} PUBLIC ${OpenCL_LIBRARIES}) -endfunction(add_sycl_to_target) \ No newline at end of file +endfunction(add_sycl_to_target) From 9ef5c948ba0ae4e24941b1319aac0b7584f10795 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 24 May 2017 13:11:16 +0100 Subject: [PATCH 112/680] Fixing CMake for gcc>=5.
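gcc 5 introduced the dual libstdc++ ABI, which can break linking against a ComputeCpp runtime built with the pre-5.1 ABI; the COMPUTECPP_DISABLE_GCC_DUAL_ABI option added to FindComputeCpp.cmake above exists for that case. A hypothetical sketch of the kind of guard involved (illustrative only, not necessarily this commit's diff):

    # Opt out of the gcc>=5 dual libstdc++ ABI when the precompiled
    # ComputeCpp runtime requires the old ABI.
    if(CMAKE_COMPILER_IS_GNUCXX AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
      if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
        add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
      endif()
    endif()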
From fb853a857a5d3e06687c6736084c20e984b7347d Mon Sep 17 00:00:00 2001 From: a-doumoulakis Date: Wed, 24 May 2017 17:50:15 +0100 Subject: [PATCH 113/680] Restore misplaced comment --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 3105cc122..06df1030b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -81,7 +81,7 @@ struct memsetCghFunctor{ } }; - //get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) +//get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU and intel GPU) EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ std::vector supported_devices; auto platform_list =cl::sycl::platform::get_platforms(); From e3f964ed55a96d0c94814d07ab88d8805e0c2eec Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Thu, 25 May 2017 11:17:26 +0100 Subject: [PATCH 114/680] Applying Benoit's comment; removing dead code. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 627c0ab19..27c10a0cf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -40,7 +40,6 @@ struct SyclAllocator { SyclAllocator( ){}; Scalar* allocate(std::size_t elements) { return static_cast(Conditional_Allocate::Val, Align>::conditional_allocate(elements)); - // return static_cast(aligned_alloc(Align, elements)); } void deallocate(Scalar * p, std::size_t size) { EIGEN_UNUSED_VARIABLE(size); free(p); } }; From c3bd860de8f1d473a05072917f56b380ba59533b Mon Sep 17 00:00:00 2001 From: a-doumoulakis Date: Thu, 25 May 2017 18:46:18 +0100 Subject: [PATCH 115/680] Modification upon request - Remove warning suppression --- CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 54f3a7509..2e1648fd1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,10 +151,6 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-Wenum-conversion") ei_add_cxx_compiler_flag("-Wc++11-extensions") ei_add_cxx_compiler_flag("-Wdouble-promotion") - ei_add_cxx_compiler_flag("-Wno-unused-parameter") - ei_add_cxx_compiler_flag("-Wno-ignored-attributes") - ei_add_cxx_compiler_flag("-Wno-ignored-qualifiers") - ei_add_cxx_compiler_flag("-Wno-sign-compare") # ei_add_cxx_compiler_flag("-Wconversion") # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6 From 0370d3576e1df4f898b9030fbc269ec0488d3969 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 26 May 2017 16:01:48 +0100 Subject: [PATCH 116/680] Applying Ronnan's comments.
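Formatting pass over the CheckAlignStatically and Conditional_Allocate helpers shown below. For reference, the guard they carry is the usual power-of-two bit trick; a standalone illustration (not part of the diff):

    // Align & (Align - 1) clears the lowest set bit, so the result is zero
    // exactly when Align has a single set bit, i.e. is a power of two:
    //   64 & 63 = 0b1000000 & 0b0111111 = 0         -> power of two
    //   48 & 47 = 0b0110000 & 0b0101111 = 0b0100000 -> not a power of two
    static_assert((64 & 63) == 0, "64 is a power of two");
    static_assert((48 & 47) != 0, "48 is not a power of two");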
--- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 27c10a0cf..09c96317e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,22 +14,22 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -template struct CheckAlignStatically{ +template struct CheckAlignStatically { static const bool Val= (((Align&(Align-1))==0) && (Align >= sizeof(void *))); }; template -struct Conditional_Allocate{ +struct Conditional_Allocate { -EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){ - return aligned_alloc(Align, elements); -} + EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements) { + return aligned_alloc(Align, elements); + } }; template -struct Conditional_Allocate{ +struct Conditional_Allocate { -EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){ - return malloc(elements); -} + EIGEN_ALWAYS_INLINE static void* conditional_allocate(std::size_t elements){ + return malloc(elements); + } }; template > struct SyclAllocator { @@ -551,4 +551,4 @@ struct SyclKernelDevice:DefaultDevice{}; } // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H \ No newline at end of file +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H From 9341f258d4ee8a819c31cec8a9dc027a10669372 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Tue, 6 Jun 2017 15:51:06 +0100 Subject: [PATCH 117/680] Add labels to #ifdef, in TensorReductionCuda.h --- .../CXX11/src/Tensor/TensorReductionCuda.h | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index edb0ab280..24a55a3d5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) else { assert(0 && "Wordsize not supported"); } -#else +#else // __CUDA_ARCH__ >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif +#endif // __CUDA_ARCH__ >= 300 } // We extend atomicExch to support extra data types @@ -98,15 +98,15 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer } } } -#endif +#endif // EIGEN_HAS_CUDA_FP16 template <> __device__ inline void atomicReduce(float* output, float accum, SumReducer&) { #if __CUDA_ARCH__ >= 300 atomicAdd(output, accum); -#else +#else // __CUDA_ARCH__ >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif +#endif // __CUDA_ARCH__ >= 300 } @@ -179,9 +179,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num // Let the last block reset the semaphore atomicInc(semaphore, gridDim.x + 1); } -#else +#else // __CUDA_ARCH__ >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif +#endif // __CUDA_ARCH__ >= 300 } @@ -268,7 +268,7 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2 *output = tmp; } -#endif +#endif // EIGEN_HAS_CUDA_FP16 template struct FullReductionLauncher { @@ -335,7 +335,7 @@ struct FullReductionLauncher { } } }; -#endif +#endif // EIGEN_HAS_CUDA_FP16 
template @@ -348,11 +348,11 @@ struct FullReducer { (internal::is_same::value || internal::is_same::value || (internal::is_same::value && reducer_traits::PacketAccess)); -#else +#else // EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same::value || internal::is_same::value); -#endif +#endif // EIGEN_HAS_CUDA_FP16 template static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { @@ -433,9 +433,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } } -#else +#else // __CUDA_ARCH__ >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif +#endif // __CUDA_ARCH__ >= 300 } #ifdef EIGEN_HAS_CUDA_FP16 @@ -533,7 +533,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, } } -#endif +#endif // EIGEN_HAS_CUDA_FP16 template struct InnerReductionLauncher { @@ -625,7 +625,7 @@ struct InnerReductionLauncher { return false; } }; -#endif +#endif // EIGEN_HAS_CUDA_FP16 template @@ -638,11 +638,11 @@ struct InnerReducer { (internal::is_same::value || internal::is_same::value || (internal::is_same::value && reducer_traits::PacketAccess)); -#else +#else // EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same::value || internal::is_same::value); -#endif +#endif // EIGEN_HAS_CUDA_FP16 template static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { @@ -740,7 +740,7 @@ struct OuterReducer { } }; -#endif +#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__) } // end namespace internal From 4bbc32046810f65bb0f77f6dbe538abad51de281 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 8 Jun 2017 12:55:25 +0200 Subject: [PATCH 118/680] bug #1435: fix aliasing issue in expressions like: A = C - B*A; --- Eigen/src/Core/ProductEvaluators.h | 6 ++++++ test/product_notemporary.cpp | 1 + 2 files changed, 7 insertions(+) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 583b7f59e..c42725dbd 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -207,6 +207,12 @@ struct evaluator_assume_aliasing +struct evaluator_assume_aliasing::Scalar>, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; + template struct assignment_from_xpr_op_product { diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index 8bf71b4f2..30592b79e 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -51,6 +51,7 @@ template void product_notemporary(const MatrixType& m) VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * (m1 * m2.transpose()), 0); VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()), 1); + VERIFY_EVALUATION_COUNT( m3 = m3 - (m1 * m2.adjoint()), 1); VERIFY_EVALUATION_COUNT( m3 = m3 + (m1 * m2.adjoint()).transpose(), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0); From 682b2ef17ee360ac11cebbe7286dc4edd9accfa3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 8 Jun 2017 15:06:27 +0200 Subject: [PATCH 119/680] bug #1423: fix LSCG's Jacobi preconditioner for row-major matrices.
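The least-squares preconditioner is diag(A^* A)^{-1}, i.e. the inverse squared norm of each column of A. A row-major sparse matrix cannot traverse a single column directly, so the fix sweeps the rows and scatters each |a_ij|^2 into diagonal slot j. A rough sketch of that accumulation, with placeholder names and assuming a real Scalar (simplified from the diff that follows):

    // Accumulate squared magnitudes per column while iterating the outer
    // (row) dimension of the row-major sparse matrix 'mat', then invert
    // the nonzero entries.
    m_invdiag.setZero();
    for (Index j = 0; j < mat.outerSize(); ++j)
      for (typename MatType::InnerIterator it(mat, j); it; ++it)
        m_invdiag(it.index()) += numext::abs2(it.value());
    for (Index j = 0; j < m_invdiag.size(); ++j)
      if (m_invdiag(j) > RealScalar(0))
        m_invdiag(j) = RealScalar(1) / m_invdiag(j);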
--- .../BasicPreconditioners.h | 27 ++++++++++++++----- test/lscg.cpp | 8 ++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 358444aff..279c9173c 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -152,13 +152,28 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> { // Compute the inverse squared-norm of each column of mat m_invdiag.resize(mat.cols()); - for(Index j=0; j0) - m_invdiag(j) = RealScalar(1)/sum; - else - m_invdiag(j) = RealScalar(1); + m_invdiag.setZero(); + for(Index j=0; j0) + m_invdiag(j) = RealScalar(1)/m_invdiag(j); + } + else + { + for(Index j=0; j0) + m_invdiag(j) = RealScalar(1)/sum; + else + m_invdiag(j) = RealScalar(1); + } } Base::m_isInitialized = true; return *this; diff --git a/test/lscg.cpp b/test/lscg.cpp index daa62a954..d49ee00c3 100644 --- a/test/lscg.cpp +++ b/test/lscg.cpp @@ -14,12 +14,20 @@ template void test_lscg_T() { LeastSquaresConjugateGradient > lscg_colmajor_diag; LeastSquaresConjugateGradient, IdentityPreconditioner> lscg_colmajor_I; + LeastSquaresConjugateGradient > lscg_rowmajor_diag; + LeastSquaresConjugateGradient, IdentityPreconditioner> lscg_rowmajor_I; CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag) ); CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I) ); CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_diag) ); CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_I) ); + + CALL_SUBTEST( check_sparse_square_solving(lscg_rowmajor_diag) ); + CALL_SUBTEST( check_sparse_square_solving(lscg_rowmajor_I) ); + + CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_diag) ); + CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_I) ); } void test_lscg() From d58882277999508b1b97624771433422897e9aee Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 11:51:53 +0200 Subject: [PATCH 120/680] Add missing std::numeric_limits specialization for half, and complete NumTraits --- Eigen/src/Core/arch/CUDA/Half.h | 50 +++++++++++++++++++++++++++++++++ test/half_float.cpp | 22 +++++++++++---- 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 67518da9f..5a9bb52e6 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -493,9 +493,59 @@ template<> struct is_arithmetic { enum { value = true }; }; } // end namespace internal +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 2; + //static const int max_digits10 = ; + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const 
int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return epsilon()/2; } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } +}; +} + +namespace Eigen { + template<> struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { return half_impl::raw_uint16_to_half(0x0800); } diff --git a/test/half_float.cpp b/test/half_float.cpp index 6f3196852..7ec40f3ae 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -96,12 +96,24 @@ void test_conversion() void test_numtraits() { - std::cout << "epsilon = " << NumTraits::epsilon() << std::endl; - std::cout << "highest = " << NumTraits::highest() << std::endl; - std::cout << "lowest = " << NumTraits::lowest() << std::endl; - std::cout << "inifinty = " << NumTraits::infinity() << std::endl; - std::cout << "nan = " << NumTraits::quiet_NaN() << std::endl; + std::cout << "epsilon = " << NumTraits::epsilon() << " (0x" << std::hex << NumTraits::epsilon().x << ")" << std::endl; + std::cout << "highest = " << NumTraits::highest() << " (0x" << std::hex << NumTraits::highest().x << ")" << std::endl; + std::cout << "lowest = " << NumTraits::lowest() << " (0x" << std::hex << NumTraits::lowest().x << ")" << std::endl; + std::cout << "min = " << (std::numeric_limits::min)() << " (0x" << std::hex << half((std::numeric_limits::min)()).x << ")" << std::endl; + std::cout << "denorm min = " << (std::numeric_limits::denorm_min)() << " (0x" << std::hex << half((std::numeric_limits::denorm_min)()).x << ")" << std::endl; + std::cout << "infinity = " << NumTraits::infinity() << " (0x" << std::hex << NumTraits::infinity().x << ")" << std::endl; + std::cout << "quiet nan = " << NumTraits::quiet_NaN() << " (0x" << std::hex << NumTraits::quiet_NaN().x << ")" << std::endl; + std::cout << "signaling nan = " << std::numeric_limits::signaling_NaN() << " (0x" << std::hex << std::numeric_limits::signaling_NaN().x << ")" << std::endl; + VERIFY(NumTraits::IsSigned); + + VERIFY_IS_EQUAL( std::numeric_limits::infinity().x, half(std::numeric_limits::infinity()).x ); + VERIFY_IS_EQUAL( std::numeric_limits::quiet_NaN().x, half(std::numeric_limits::quiet_NaN()).x ); + VERIFY_IS_EQUAL( std::numeric_limits::signaling_NaN().x, half(std::numeric_limits::signaling_NaN()).x ); + VERIFY( (std::numeric_limits::min)() > half(0.f) ); + VERIFY( (std::numeric_limits::denorm_min)() > half(0.f) ); + VERIFY( (std::numeric_limits::min)()/2 > half(0.f) ); + VERIFY_IS_EQUAL( (std::numeric_limits::denorm_min)()/2, half(0.f) ); } void test_arithmetic() From 498aa95a8b2b505c919efff8857a6fd2a5bbc172 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 11:53:49 +0200 Subject: [PATCH 121/680] bug #1424: add 
numext::abs specialization for unsigned integer types. --- Eigen/src/Core/MathFunctions.h | 10 ++++++- test/CMakeLists.txt | 1 + test/main.h | 11 +++++++ test/numext.cpp | 53 ++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 test/numext.cpp diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 5ec6c395e..0be4a25da 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1232,11 +1232,19 @@ double log(const double &x) { return ::log(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename NumTraits::Real abs(const T &x) { +typename internal::enable_if::IsSigned || NumTraits::IsComplex,typename NumTraits::Real>::type +abs(const T &x) { EIGEN_USING_STD_MATH(abs); return abs(x); } +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +typename internal::enable_if::IsSigned || NumTraits::IsComplex),typename NumTraits::Real>::type +abs(const T &x) { + return x; +} + #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8da51ce57..e73ab92b4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -148,6 +148,7 @@ add_custom_target(BuildOfficial) ei_add_test(rand) ei_add_test(meta) +ei_add_test(numext) ei_add_test(sizeof) ei_add_test(dynalloc) ei_add_test(nomalloc) diff --git a/test/main.h b/test/main.h index 25d2dcf43..bd5325196 100644 --- a/test/main.h +++ b/test/main.h @@ -310,6 +310,17 @@ template<> inline float test_precision >() { return test_pre template<> inline double test_precision >() { return test_precision(); } template<> inline long double test_precision >() { return test_precision(); } +inline bool test_isApprox(const short& a, const short& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned short& a, const unsigned short& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned int& a, const unsigned int& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const long& a, const long& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned long& a, const unsigned long& b) +{ return internal::isApprox(a, b, test_precision()); } + inline bool test_isApprox(const int& a, const int& b) { return internal::isApprox(a, b, test_precision()); } inline bool test_isMuchSmallerThan(const int& a, const int& b) diff --git a/test/numext.cpp b/test/numext.cpp new file mode 100644 index 000000000..3de33e2f9 --- /dev/null +++ b/test/numext.cpp @@ -0,0 +1,53 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +template +void check_abs() { + typedef typename NumTraits::Real Real; + + if(NumTraits::IsSigned) + VERIFY_IS_EQUAL(numext::abs(-T(1)), T(1)); + VERIFY_IS_EQUAL(numext::abs(T(0)), T(0)); + VERIFY_IS_EQUAL(numext::abs(T(1)), T(1)); + + for(int k=0; k(); + if(!internal::is_same::value) + x = x/Real(2); + if(NumTraits::IsSigned) + { + VERIFY_IS_EQUAL(numext::abs(x), numext::abs(-x)); + VERIFY( numext::abs(-x) >= Real(0)); + } + VERIFY( numext::abs(x) >= Real(0)); + VERIFY_IS_APPROX( numext::abs2(x), numext::abs2(numext::abs(x)) ); + } +} + +void test_numext() { + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + CALL_SUBTEST( check_abs() ); + + CALL_SUBTEST( check_abs >() ); + CALL_SUBTEST( check_abs >() ); +} From a7be4cd1b1ea4d85165f003d793f1d46b199b7bd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 11:57:53 +0200 Subject: [PATCH 122/680] Fix LeastSquareDiagonalPreconditioner for complexes (issue introduced in previous commit) --- Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 279c9173c..facdaf890 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -158,18 +158,18 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> for(Index j=0; j0) - m_invdiag(j) = RealScalar(1)/m_invdiag(j); + if(numext::real(m_invdiag(j))>RealScalar(0)) + m_invdiag(j) = RealScalar(1)/numext::real(m_invdiag(j)); } else { for(Index j=0; j0) + if(sum>RealScalar(0)) m_invdiag(j) = RealScalar(1)/sum; else m_invdiag(j) = RealScalar(1); From 8640093af1f66dd6c390420ee6919f7ba0c89e0c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 12:45:01 +0200 Subject: [PATCH 123/680] fix compilation in C++98 --- unsupported/test/autodiff_scalar.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index 3a78d6b8d..9cf11280c 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -81,8 +81,9 @@ void check_limits_specialization() typedef std::numeric_limits A; typedef std::numeric_limits B; - bool res = std::is_base_of::value; - VERIFY_IS_EQUAL(res, true); +#if EIGEN_HAS_CXX11 + VERIFY(bool(std::is_base_of::value)); +#endif } void test_autodiff_scalar() From 723a59ac26d28e42ad07221dc30b75ea8049ce1f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 12:54:40 +0200 Subject: [PATCH 124/680] add regression test for aliasing in product rewritting --- test/product.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/product.h b/test/product.h index 3b6511270..0425a929e 100644 --- a/test/product.h +++ b/test/product.h @@ -216,6 +216,8 @@ template void product(const MatrixType& m) // CwiseBinaryOp VERIFY_IS_APPROX(x = y + A*x, A*z); x = z; + VERIFY_IS_APPROX(x = y - A*x, A*(-z)); + x = z; // CwiseUnaryOp VERIFY_IS_APPROX(x = Scalar(1.)*(A*x), A*z); } From fb1ee04087f6d5cfe9009941c1a1eb25b3133a04 Mon 
Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 13:13:03 +0200 Subject: [PATCH 125/680] bug #1410: fix lvalue propagation of Array/Matrix-Wrapper with a const nested expression. --- Eigen/src/Core/ArrayWrapper.h | 6 ++++-- test/array_for_matrix.cpp | 22 +++++++++++++++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h index a04521a16..688aadd62 100644 --- a/Eigen/src/Core/ArrayWrapper.h +++ b/Eigen/src/Core/ArrayWrapper.h @@ -32,7 +32,8 @@ struct traits > // Let's remove NestByRefBit enum { Flags0 = traits::type >::Flags, - Flags = Flags0 & ~NestByRefBit + LvalueBitFlag = is_lvalue::value ? LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; }; } @@ -129,7 +130,8 @@ struct traits > // Let's remove NestByRefBit enum { Flags0 = traits::type >::Flags, - Flags = Flags0 & ~NestByRefBit + LvalueBitFlag = is_lvalue::value ? LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; }; } diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp index c1501947b..b8721391f 100644 --- a/test/array_for_matrix.cpp +++ b/test/array_for_matrix.cpp @@ -235,12 +235,31 @@ template void resize(const MatrixTraits& t) VERIFY(a1.size()==cols); } +template void regression_bug_654() { ArrayXf a = RowVectorXf(3); VectorXf v = Array(3); } +// Check propagation of LvalueBit through Array/Matrix-Wrapper +template +void regression_bug_1410() +{ + const Matrix4i M; + const Array4i A; + ArrayWrapper MA = M.array(); + MA.row(0); + MatrixWrapper AM = A.matrix(); + AM.row(0); + + VERIFY((internal::traits >::Flags&LvalueBit)==0); + VERIFY((internal::traits >::Flags&LvalueBit)==0); + + VERIFY((internal::traits >::Flags&LvalueBit)==LvalueBit); + VERIFY((internal::traits >::Flags&LvalueBit)==LvalueBit); +} + void test_array_for_matrix() { for(int i = 0; i < g_repeat; i++) { @@ -280,5 +299,6 @@ void test_array_for_matrix() CALL_SUBTEST_5( resize(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( resize(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } - CALL_SUBTEST_6( regression_bug_654() ); + CALL_SUBTEST_6( regression_bug_654<0>() ); + CALL_SUBTEST_6( regression_bug_1410<0>() ); } From 1d59ca245811b5becc24bc89510b3d359f3a2dc1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 13:20:52 +0200 Subject: [PATCH 126/680] Fix compilation with gcc 4.3 and ARM NEON --- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index aede4a6d5..836fbc0dd 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -116,7 +116,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { - const float32_t f[] = {0, 1, 2, 3}; + const float f[] = {0, 1, 2, 3}; Packet4f countdown = vld1q_f32(f); return vaddq_f32(pset1(a), countdown); } From 26f552c18de48fa04f2a9a9a27540a7679328254 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 13:36:58 +0200 Subject: [PATCH 127/680] fix compilation of Half in C++98 (issue introduced in previous commit) --- Eigen/src/Core/arch/CUDA/Half.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h 
index 5a9bb52e6..e4e639fcd 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -507,7 +507,7 @@ struct numeric_limits { static const bool has_signaling_NaN = true; static const float_denorm_style has_denorm = denorm_present; static const bool has_denorm_loss = false; - std::float_round_style round_style = std::round_to_nearest; + static const std::float_round_style round_style = std::round_to_nearest; static const bool is_iec559 = false; static const bool is_bounded = false; static const bool is_modulo = false; @@ -526,7 +526,7 @@ struct numeric_limits { static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } - static Eigen::half round_error() { return epsilon()/2; } + static Eigen::half round_error() { return Eigen::half(0.5); } static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } From 90168c003d80236e935589d56f9fbd8e9c7aaa75 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 14:01:44 +0200 Subject: [PATCH 128/680] bug #1414: doxygen, add EigenBase to CoreModule --- Eigen/src/Core/EigenBase.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h index ccc122cfc..b195506a9 100644 --- a/Eigen/src/Core/EigenBase.h +++ b/Eigen/src/Core/EigenBase.h @@ -14,6 +14,7 @@ namespace Eigen { /** \class EigenBase + * \ingroup Core_Module * * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T). * From ba5cab576a7615af12389ff159c6ed57b5312d5e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 14:38:04 +0200 Subject: [PATCH 129/680] bug #1405: enable StrictlyLower/StrictlyUpper triangularView as the destination of matrix*matrix products. --- .../Core/products/GeneralMatrixMatrixTriangular.h | 13 +++++++++---- .../products/GeneralMatrixMatrixTriangular_BLAS.h | 2 +- test/product_mmtr.cpp | 11 +++++++++-- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index ad38bcf51..e436c50a4 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -269,10 +269,13 @@ struct general_product_to_triangular_selector enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, - RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0, + SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0 }; Index size = mat.cols(); + if(SkipDiag) + size--; Index depth = actualLhs.cols(); typedef internal::gemm_blocking_space internal::general_matrix_matrix_triangular_product + IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)> ::run(size, depth, - &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), - mat.data(), mat.outerStride(), actualAlpha, blocking); + &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(), + &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 
1 : 0), actualRhs.outerStride(), + mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking); } }; @@ -294,6 +298,7 @@ template template EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { + EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 5b7c15cca..41e18ff07 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -52,7 +52,7 @@ struct general_matrix_matrix_triangular_product& blocking) \ { \ - if (lhs==rhs) { \ + if ( lhs==rhs && ((UpLo&(Lower|Upper)==UpLo)) ) { \ general_matrix_matrix_rankupdate \ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ diff --git a/test/product_mmtr.cpp b/test/product_mmtr.cpp index f6e4bb1ae..d3e24b012 100644 --- a/test/product_mmtr.cpp +++ b/test/product_mmtr.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2010 Gael Guennebaud +// Copyright (C) 2010-2017 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,12 +10,19 @@ #include "main.h" #define CHECK_MMTR(DEST, TRI, OP) { \ + ref3 = DEST; \ ref2 = ref1 = DEST; \ DEST.template triangularView() OP; \ ref1 OP; \ ref2.template triangularView() \ = ref1.template triangularView(); \ VERIFY_IS_APPROX(DEST,ref2); \ + \ + DEST = ref3; \ + ref3 = ref2; \ + ref3.diagonal() = DEST.diagonal(); \ + DEST.template triangularView() OP; \ + VERIFY_IS_APPROX(DEST,ref3); \ } template void mmtr(int size) @@ -27,7 +34,7 @@ template void mmtr(int size) MatrixColMaj matc = MatrixColMaj::Zero(size, size); MatrixRowMaj matr = MatrixRowMaj::Zero(size, size); - MatrixColMaj ref1(size, size), ref2(size, size); + MatrixColMaj ref1(size, size), ref2(size, size), ref3(size,size); MatrixColMaj soc(size,othersize); soc.setRandom(); MatrixColMaj osc(othersize,size); osc.setRandom(); From 1bbcf19029ea3ac5493c3511807daa53fcde9e90 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 14:44:02 +0200 Subject: [PATCH 130/680] bug #1403: fix implicit scalar type conversion. --- Eigen/src/QR/ColPivHouseholderQR.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 1b828f5dc..ae7c1775f 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -555,7 +555,7 @@ void ColPivHouseholderQR::computeInPlace() if (m_colNormsUpdated.coeffRef(j) != 0) { RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j); temp = (RealScalar(1) + temp) * (RealScalar(1) - temp); - temp = temp < 0 ? 0 : temp; + temp = temp < 0 ? 
RealScalar(0) : temp; RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / m_colNormsDirect.coeffRef(j)); if (temp2 <= norm_downdate_threshold) { From 731c8c704df3c2a2f5872be712a12821e4acd869 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 15:45:49 +0200 Subject: [PATCH 131/680] bug #1403: more scalar conversions fixes in BDCSVD --- Eigen/src/QR/ColPivHouseholderQR.h | 4 +- Eigen/src/SVD/BDCSVD.h | 81 +++++++++++++------------- Eigen/src/SVD/UpperBidiagonalization.h | 4 +- 3 files changed, 46 insertions(+), 43 deletions(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index ae7c1775f..5270eaca2 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -552,10 +552,10 @@ void ColPivHouseholderQR::computeInPlace() // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf // and used in LAPACK routines xGEQPF and xGEQP3. // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html - if (m_colNormsUpdated.coeffRef(j) != 0) { + if (m_colNormsUpdated.coeffRef(j) != RealScalar(0)) { RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j); temp = (RealScalar(1) + temp) * (RealScalar(1) - temp); - temp = temp < 0 ? RealScalar(0) : temp; + temp = temp < RealScalar(0) ? RealScalar(0) : temp; RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / m_colNormsDirect.coeffRef(j)); if (temp2 <= norm_downdate_threshold) { diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 25fca6f4d..d7a4271cb 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -77,6 +77,7 @@ public: typedef _MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; + typedef typename NumTraits::Literal Literal; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -259,7 +260,7 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows RealScalar scale = matrix.cwiseAbs().maxCoeff(); - if(scale==RealScalar(0)) scale = RealScalar(1); + if(scale==Literal(0)) scale = Literal(1); MatrixX copy; if (m_isTranspose) copy = matrix.adjoint()/scale; else copy = matrix/scale; @@ -351,13 +352,13 @@ void BDCSVD::structured_update(Block A, co Index k1=0, k2=0; for(Index j=0; j::divide (Index firstCol, Index lastCol, Index firstRowW, l = m_naiveU.row(1).segment(firstCol, k); f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1); } - if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1; + if (m_compV) m_naiveV(firstRowW+k, firstColW) = Literal(1); if (r0::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n); m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal(); ArrayRef diag = m_workspace.head(n); - diag(0) = 0; + diag(0) = Literal(0); // Allocate space for singular values and vectors singVals.resize(n); @@ -590,7 +591,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // but others are interleaved and we must ignore them at this stage. 
// To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==0) --actual_n; + while(actual_n>1 && diag(actual_n-1)==Literal(0)) --actual_n; Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) @@ -691,7 +692,7 @@ template typename BDCSVD::RealScalar BDCSVD::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift) { Index m = perm.size(); - RealScalar res = 1; + RealScalar res = Literal(1); for(Index i=0; i::computeSingVals(const ArrayRef& col0, const ArrayRef& d Index n = col0.size(); Index actual_n = n; - while(actual_n>1 && col0(actual_n-1)==0) --actual_n; + while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n; for (Index k = 0; k < n; ++k) { - if (col0(k) == 0 || actual_n==1) + if (col0(k) == Literal(0) || actual_n==1) { // if col0(k) == 0, then entry is deflated, so singular value is on diagonal // if actual_n==1, then the deflated problem is already diagonalized singVals(k) = k==0 ? col0(0) : diag(k); - mus(k) = 0; + mus(k) = Literal(0); shifts(k) = k==0 ? col0(0) : diag(k); continue; } @@ -733,13 +734,13 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d { // Skip deflated singular values Index l = k+1; - while(col0(l)==0) { ++l; eigen_internal_assert(l::computeSingVals(const ArrayRef& col0, const ArrayRef& d << " " << secularEq(0.8*(left+right), col0, diag, perm, diag, 0) << " " << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n"; #endif - RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right; + RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right; // measure everything relative to shift Map diagShifted(m_workspace.data()+4*n, n); @@ -785,13 +786,13 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // rational interpolation: fit a function of the form a / mu + b through the two previous // iterates and use its zero to compute the next iterate - bool useBisection = fPrev*fCur>0; - while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) + bool useBisection = fPrev*fCur>Literal(0); + while (fCur!=Literal(0) && abs(muCur - muPrev) > Literal(8) * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) { ++m_numIters; // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples. 
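      // Concretely, matching the two samples (muPrev, fPrev) and (muCur, fCur) with
      // f(mu) = a/mu + b gives a = (fCur - fPrev) / (1/muCur - 1/muPrev) and
      // b = fCur - a/muCur; the next iterate is then the zero of f, i.e. muZero = -a/b.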
- RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev); + RealScalar a = (fCur - fPrev) / (Literal(1)/muCur - Literal(1)/muPrev); RealScalar b = fCur - a / muCur; // And find mu such that f(mu)==0: RealScalar muZero = -a/b; @@ -803,8 +804,8 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d fCur = fZero; - if (shift == left && (muCur < 0 || muCur > right - left)) useBisection = true; - if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true; + if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; + if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; } @@ -841,13 +842,13 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; } #endif - eigen_internal_assert(fLeft * fRight < 0); + eigen_internal_assert(fLeft * fRight < Literal(0)); - while (rightShifted - leftShifted > 2 * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) + while (rightShifted - leftShifted > Literal(2) * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) { - RealScalar midShifted = (leftShifted + rightShifted) / 2; + RealScalar midShifted = (leftShifted + rightShifted) / Literal(2); fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); - if (fLeft * fMid < 0) + if (fLeft * fMid < Literal(0)) { rightShifted = midShifted; } @@ -858,7 +859,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d } } - muCur = (leftShifted + rightShifted) / 2; + muCur = (leftShifted + rightShifted) / Literal(2); } singVals[k] = shift + muCur; @@ -892,8 +893,8 @@ void BDCSVD::perturbCol0 // The offset permits to skip deflated entries while computing zhat for (Index k = 0; k < n; ++k) { - if (col0(k) == 0) // deflated - zhat(k) = 0; + if (col0(k) == Literal(0)) // deflated + zhat(k) = Literal(0); else { // see equation (3.6) @@ -918,7 +919,7 @@ void BDCSVD::perturbCol0 std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); - zhat(k) = col0(k) > 0 ? tmp : -tmp; + zhat(k) = col0(k) > Literal(0) ? 
tmp : -tmp; } } } @@ -934,7 +935,7 @@ void BDCSVD::computeSingVecs for (Index k = 0; k < n; ++k) { - if (zhat(k) == 0) + if (zhat(k) == Literal(0)) { U.col(k) = VectorType::Unit(n+1, k); if (m_compV) V.col(k) = VectorType::Unit(n, k); @@ -947,7 +948,7 @@ void BDCSVD::computeSingVecs Index i = perm(l); U(i,k) = zhat(i)/(((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k])); } - U(n,k) = 0; + U(n,k) = Literal(0); U.col(k).normalize(); if (m_compV) @@ -958,7 +959,7 @@ void BDCSVD::computeSingVecs Index i = perm(l); V(i,k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k])); } - V(0,k) = -1; + V(0,k) = Literal(-1); V.col(k).normalize(); } } @@ -980,14 +981,14 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index RealScalar c = m_computed(start, start); RealScalar s = m_computed(start+i, start); RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s)); - if (r == 0) + if (r == Literal(0)) { - m_computed(start+i, start+i) = 0; + m_computed(start+i, start+i) = Literal(0); return; } m_computed(start,start) = r; - m_computed(start+i, start) = 0; - m_computed(start+i, start+i) = 0; + m_computed(start+i, start) = Literal(0); + m_computed(start+i, start+i) = Literal(0); JacobiRotation J(c/r,-s/r); if (m_compU) m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J); @@ -1020,7 +1021,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi << m_computed(firstColm + i+1, firstColm+i+1) << " " << m_computed(firstColm + i+2, firstColm+i+2) << "\n"; #endif - if (r==0) + if (r==Literal(0)) { m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j); return; @@ -1029,7 +1030,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi s/=r; m_computed(firstColm + i, firstColm) = r; m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i); - m_computed(firstColm + j, firstColm) = 0; + m_computed(firstColm + j, firstColm) = Literal(0); JacobiRotation J(c,-s); if (m_compU) m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J); @@ -1053,7 +1054,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index const RealScalar considerZero = (std::numeric_limits::min)(); RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits::epsilon() * maxDiag); - RealScalar epsilon_coarse = 8 * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); + RealScalar epsilon_coarse = Literal(8) * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); #ifdef EIGEN_BDCSVD_SANITY_CHECKS assert(m_naiveU.allFinite()); @@ -1081,7 +1082,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << " (diag(" << i << ")=" << diag(i) << ")\n"; #endif - col0(i) = 0; + col0(i) = Literal(0); } //condition 4.3 diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 0b1460894..11ac847e1 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -159,6 +159,8 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, traits::Flags & RowMajorBit> > Y) { typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename 
NumTraits::Literal Literal; enum { StorageOrder = traits::Flags & RowMajorBit }; typedef InnerStride ColInnerStride; typedef InnerStride RowInnerStride; @@ -263,7 +265,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, SubMatType A10( A.block(bs,0, brows-bs,bs) ); SubMatType A01( A.block(0,bs, bs,bcols-bs) ); Scalar tmp = A01(bs-1,0); - A01(bs-1,0) = 1; + A01(bs-1,0) = Literal(1); A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint(); A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01; A01(bs-1,0) = tmp; From c3e2afce0d2f5b4a3b9d44fc85e5103869299e47 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 16:25:18 +0200 Subject: [PATCH 132/680] Enable MSVC 2010 workaround from MSVC only --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 14ec87da8..755646795 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -955,7 +955,7 @@ namespace Eigen { const typename internal::plain_constant_type::type, const EXPR> // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010") -#if EIGEN_COMP_MSVC_STRICT<=1600 +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600) #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if::type #else #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X From a4fd4233adbf4132f24bf989e3099022c5f3e207 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Jun 2017 23:02:02 +0200 Subject: [PATCH 133/680] Fix compilation with some compilers --- test/half_float.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/half_float.cpp b/test/half_float.cpp index 7ec40f3ae..7b6208873 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -112,8 +112,8 @@ void test_numtraits() VERIFY_IS_EQUAL( std::numeric_limits::signaling_NaN().x, half(std::numeric_limits::signaling_NaN()).x ); VERIFY( (std::numeric_limits::min)() > half(0.f) ); VERIFY( (std::numeric_limits::denorm_min)() > half(0.f) ); - VERIFY( (std::numeric_limits::min)()/2 > half(0.f) ); - VERIFY_IS_EQUAL( (std::numeric_limits::denorm_min)()/2, half(0.f) ); + VERIFY( (std::numeric_limits::min)()/half(2) > half(0.f) ); + VERIFY_IS_EQUAL( (std::numeric_limits::denorm_min)()/half(2), half(0.f) ); } void test_arithmetic() From 50e09cca0f2312d89ced431b3a6cd7ff7c925d09 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 11 Jun 2017 15:30:36 +0200 Subject: [PATCH 134/680] fix tipo --- doc/AsciiQuickReference.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index 8409f8850..0ca54cef3 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -140,7 +140,7 @@ R.array().abs() // abs(P) R.cwiseAbs2() // abs(P.^2) R.array().abs2() // abs(P.^2) (R.array() < s).select(P,Q ); // (R < s ? 
P : Q) -R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0) +R = (Q.array()==0).select(P,R) // R(Q==0) = P(Q==0) R = P.unaryExpr(ptr_fun(func)) // R = arrayfun(func, P) // with: scalar func(const scalar &x); From 6dcf966558ff9782f34c5df897c15eb37f1e95e3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 12 Jun 2017 16:47:22 +0200 Subject: [PATCH 135/680] Avoid implicit scalar conversion with accuracy loss in pow(scalar,array) --- Eigen/src/Core/GlobalFunctions.h | 49 +++++++++++++++----------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 12828a7c3..50406400b 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -103,17 +103,18 @@ namespace Eigen inline const CwiseBinaryOp,Derived,Constant > pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent), - const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type - pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) { - return x.derived().pow(exponent); - } - - template - inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow) - pow(const Eigen::ArrayBase& x, const typename Derived::Scalar& exponent) { - return x.derived().pow(exponent); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,pow)) + pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) + { + typedef typename internal::promote_scalar_arg::type PromotedExponent; + return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(), + typename internal::plain_constant_type::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op(exponent))); } #endif @@ -156,21 +157,17 @@ namespace Eigen inline const CwiseBinaryOp,Constant,Derived> pow(const Scalar& x,const Eigen::ArrayBase& x); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar), - const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type - pow(const Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); - } - - template - inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow) - pow(const typename Derived::Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,pow)) + pow(const Scalar& x, const Eigen::ArrayBase& exponents) { + typedef typename internal::promote_scalar_arg::type PromotedScalar; + return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)( + typename internal::plain_constant_type::type(exponents.derived().rows(), exponents.derived().cols(), 
internal::scalar_constant_op(x)), exponents.derived()); } #endif From c0e1d510fd7086a7e79987b7df0a789a4457c5dc Mon Sep 17 00:00:00 2001 From: Kyle Vedder Date: Wed, 4 Oct 2017 21:01:23 -0500 Subject: [PATCH 136/680] Add support for SuiteSparse's KLU routines --- cmake/FindKLU.cmake | 51 +++ test/CMakeLists.txt | 16 + test/klu_support.cpp | 32 ++ unsupported/Eigen/KLUSupport | 41 ++ unsupported/Eigen/src/KLUSupport/KLUSupport.h | 364 ++++++++++++++++++ 5 files changed, 504 insertions(+) create mode 100644 cmake/FindKLU.cmake create mode 100644 test/klu_support.cpp create mode 100644 unsupported/Eigen/KLUSupport create mode 100644 unsupported/Eigen/src/KLUSupport/KLUSupport.h diff --git a/cmake/FindKLU.cmake b/cmake/FindKLU.cmake new file mode 100644 index 000000000..2783b63d2 --- /dev/null +++ b/cmake/FindKLU.cmake @@ -0,0 +1,51 @@ +# KLU lib usually requires linking to a blas library. +# It is up to the user of this module to find a BLAS and link to it. + +if (KLU_INCLUDES AND KLU_LIBRARIES) + set(KLU_FIND_QUIETLY TRUE) +endif (KLU_INCLUDES AND KLU_LIBRARIES) + +find_path(KLU_INCLUDES + NAMES + klu.h + PATHS + $ENV{KLUDIR} + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + suitesparse + ufsparse +) + +if(KLU_LIBRARIES) + + if(NOT KLU_LIBDIR) + get_filename_component(KLU_LIBDIR ${KLU_LIBRARIES} PATH) + endif(NOT KLU_LIBDIR) + + find_library(COLAMD_LIBRARY colamd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(COLAMD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${COLAMD_LIBRARY}) + endif () + + find_library(AMD_LIBRARY amd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(AMD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${AMD_LIBRARY}) + endif () + + find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(SUITESPARSE_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${SUITESPARSE_LIBRARY}) + endif () + + find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(CHOLMOD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${CHOLMOD_LIBRARY}) + endif() + +endif(KLU_LIBRARIES) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(KLU DEFAULT_MSG + KLU_INCLUDES KLU_LIBRARIES) + +mark_as_advanced(KLU_INCLUDES KLU_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY CHOLMOD_LIBRARY SUITESPARSE_LIBRARY) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e73ab92b4..8bd086ce3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,6 +68,17 @@ else() ei_add_property(EIGEN_MISSING_BACKENDS "UmfPack, ") endif() +find_package(KLU) +if(KLU_FOUND) + add_definitions("-DEIGEN_KLU_SUPPORT") + include_directories(${KLU_INCLUDES}) + set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) + ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ") +else() + ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ") +endif() + find_package(SuperLU 4.0) if(SUPERLU_FOUND) add_definitions("-DEIGEN_SUPERLU_SUPPORT") @@ -297,6 +308,11 @@ if(UMFPACK_FOUND) ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") endif() +if(KLU_FOUND OR SuiteSparse_FOUND) + message("ADDING KLU TEST") + ei_add_test(klu_support "" "${KLU_ALL_LIBS}") +endif() + if(SUPERLU_FOUND) ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") endif() diff --git a/test/klu_support.cpp b/test/klu_support.cpp new file mode 100644 index 000000000..8b1fdeb41 --- /dev/null +++ b/test/klu_support.cpp @@ -0,0 +1,32 @@ +// This file is part of Eigen, a lightweight C++ template library +// for 
linear algebra. +// +// Copyright (C) 2011 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS +#include "sparse_solver.h" + +#include + +template void test_klu_support_T() +{ + KLU > klu_colmajor; + KLU > klu_rowmajor; + + check_sparse_square_solving(klu_colmajor); + check_sparse_square_solving(klu_rowmajor); + + //check_sparse_square_determinant(umfpack_colmajor); + //check_sparse_square_determinant(umfpack_rowmajor); +} + +void test_klu_support() +{ + CALL_SUBTEST_1(test_klu_support_T()); + CALL_SUBTEST_2(test_klu_support_T >()); +} + diff --git a/unsupported/Eigen/KLUSupport b/unsupported/Eigen/KLUSupport new file mode 100644 index 000000000..b23d90535 --- /dev/null +++ b/unsupported/Eigen/KLUSupport @@ -0,0 +1,41 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_MODULE_H +#define EIGEN_KLUSUPPORT_MODULE_H + +#include + +#include + +extern "C" { +#include +#include + } + +/** \ingroup Support_modules + * \defgroup KLUSupport_Module KLUSupport module + * + * This module provides an interface to the KLU library which is part of the SuiteSparse package. + * It provides the following factorization class: + * - class KLU: a sparse LU factorization, well-suited for circuit simulation. + * + * \code + * #include + * \endcode + * + * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies. + * The dependencies depend on how KLU has been compiled. + * For a CMake-based project, you can use our FindKLU.cmake module to help you in this task. + * + */ + +#include "src/KLUSupport/KLUSupport.h" + +#include + +#endif // EIGEN_KLUSUPPORT_MODULE_H diff --git a/unsupported/Eigen/src/KLUSupport/KLUSupport.h b/unsupported/Eigen/src/KLUSupport/KLUSupport.h new file mode 100644 index 000000000..d2781202e --- /dev/null +++ b/unsupported/Eigen/src/KLUSupport/KLUSupport.h @@ -0,0 +1,364 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Kyle Macfarlan +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_H +#define EIGEN_KLUSUPPORT_H + +namespace Eigen { + +/* TODO extract L, extract U, compute det, etc... */ + +/** \ingroup KLUSupport_Module + * \brief A sparse LU factorization and solver based on KLU + * + * This class allows solving A.X = B sparse linear problems via an LU factorization + * using the KLU library. The sparse matrix A must be square and of full rank. + * The vectors or matrices X and B can be either dense or sparse. + * + * \warning The input matrix A should be in a \b compressed and \b column-major form. + * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix. 
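+ *
+ * A minimal usage sketch (assuming the KLU/BTF headers and libraries are installed and linked):
+ * \code
+ * #include <Eigen/Sparse>
+ * #include <Eigen/KLUSupport>
+ *
+ * Eigen::SparseMatrix<double> A(2,2);
+ * A.insert(0,0) = 4.0; A.insert(0,1) = 1.0; A.insert(1,1) = 2.0;
+ * A.makeCompressed();                                  // compressed, column-major storage
+ * Eigen::VectorXd b(2); b << 9.0, 4.0;
+ * Eigen::KLU<Eigen::SparseMatrix<double> > solver(A);  // analyze pattern + factorize
+ * Eigen::VectorXd x = solver.solve(b);                 // x such that A*x = b
+ * \endcode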
+ * \tparam _MatrixType the type of the sparse matrix A; it must be a SparseMatrix<> + * + * \implsparsesolverconcept + * + * \sa \ref TutorialSparseSolverConcept, class SparseLU + */ + + +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, double B [ ], klu_common *Common, double) { + return klu_solve(Symbolic, Numeric, ldim, nrhs, B, Common); +} + +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_solve(Symbolic, Numeric, ldim, nrhs, &numext::real_ref(B[0]), Common); +} + +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, double B[], klu_common *Common, double) { + return klu_tsolve(Symbolic, Numeric, ldim, nrhs, B, Common); +} + +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_tsolve(Symbolic, Numeric, ldim, nrhs, &numext::real_ref(B[0]), 0, Common); +} + +inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) { + return klu_factor(Ap, Ai, Ax, Symbolic, Common); +} + +inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex) { + return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common); +} + + +template +class KLU : public SparseSolverBase > +{ + protected: + typedef SparseSolverBase > Base; + using Base::m_isInitialized; + public: + using Base::_solve_impl; + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename MatrixType::StorageIndex StorageIndex; + typedef Matrix Vector; + typedef Matrix IntRowVectorType; + typedef Matrix IntColVectorType; + typedef SparseMatrix LUMatrixType; + typedef SparseMatrix KLUMatrixType; + typedef Ref KLUMatrixRef; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + + public: + + KLU() + : m_dummy(0,0), mp_matrix(m_dummy) + { + init(); + } + + template + explicit KLU(const InputMatrixType& matrix) + : mp_matrix(matrix) + { + init(); + compute(matrix); + } + + ~KLU() + { + if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common); + if(m_numeric) klu_free_numeric(&m_numeric,&m_common); + } + + inline Index rows() const { return mp_matrix.rows(); } + inline Index cols() const { return mp_matrix.cols(); } + + /** \brief Reports whether previous computation was successful. + * + * \returns \c Success if computation was successful, + * \c NumericalIssue if the matrix appears to be singular. + */ + ComputationInfo info() const + { + eigen_assert(m_isInitialized && "Decomposition is not initialized."); + return m_info; + } + + inline const LUMatrixType& matrixL() const + { + if (m_extractedDataAreDirty) extractData(); + return m_l; + } + + inline const LUMatrixType& matrixU() const + { + if (m_extractedDataAreDirty) extractData(); + return m_u; + } + + inline const IntColVectorType& permutationP() const + { + if (m_extractedDataAreDirty) extractData(); + return m_p; + } + + inline const IntRowVectorType& permutationQ() const + { + if (m_extractedDataAreDirty) extractData(); + return m_q; + } + + /** Computes the sparse LU decomposition of \a matrix + * Note that the matrix should be column-major, and in compressed format for best performance. 
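+ * A typical call sequence is (a sketch):
+ * \code
+ * solver.compute(A);
+ * x = solver.solve(b);
+ * \endcode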
+ * \sa SparseMatrix::makeCompressed(). + */ + template + void compute(const InputMatrixType& matrix) + { + if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); + if(m_numeric) klu_free_numeric(&m_numeric, &m_common); + grab(matrix.derived()); + analyzePattern_impl(); + factorize_impl(); + } + + /** Performs a symbolic decomposition on the sparsity pattern of \a matrix. + * + * This function is particularly useful when solving several problems having the same structure. + * + * \sa factorize(), compute() + */ + template + void analyzePattern(const InputMatrixType& matrix) + { + if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); + if(m_numeric) klu_free_numeric(&m_numeric, &m_common); + + grab(matrix.derived()); + + analyzePattern_impl(); + } + + + /** Provides access to the control settings array used by KLU. + * + * See KLU documentation for details. + */ + inline const klu_common& kluCommon() const + { + return m_common; + } + + /** Provides access to the control settings array used by KLU. + * + * If this array contains NaN's, the default values are used. + * + * See KLU documentation for details. + */ + inline klu_common& kluCommon() + { + return m_common; + } + + /** Performs a numeric decomposition of \a matrix + * + * The given matrix must have the same sparsity pattern as the matrix on which the pattern analysis has been performed. + * + * \sa analyzePattern(), compute() + */ + template + void factorize(const InputMatrixType& matrix) + { + eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()"); + if(m_numeric) + klu_free_numeric(&m_numeric,&m_common); + + grab(matrix.derived()); + + factorize_impl(); + } + + /** \internal */ + template + bool _solve_impl(const MatrixBase &b, MatrixBase &x) const; + + Scalar determinant() const; + + void extractData() const; + + protected: + + void init() + { + m_info = InvalidInput; + m_isInitialized = false; + m_numeric = 0; + m_symbolic = 0; + m_extractedDataAreDirty = true; + + klu_defaults(&m_common); + } + + void analyzePattern_impl() + { + m_info = InvalidInput; + m_analysisIsOk = false; + m_factorizationIsOk = false; + m_symbolic = klu_analyze(internal::convert_index(mp_matrix.rows()), + const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), + &m_common); + if (m_symbolic) { + m_isInitialized = true; + m_info = Success; + m_analysisIsOk = true; + m_extractedDataAreDirty = true; + } + } + + void factorize_impl() + { + + m_numeric = klu_factor(const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), const_cast(mp_matrix.valuePtr()), + m_symbolic, &m_common, Scalar()); + + + m_info = m_numeric ? Success : NumericalIssue; + m_factorizationIsOk = m_numeric ? 1 : 0; + m_extractedDataAreDirty = true; + } + + template + void grab(const EigenBase &A) + { + mp_matrix.~KLUMatrixRef(); + ::new (&mp_matrix) KLUMatrixRef(A.derived()); + } + + void grab(const KLUMatrixRef &A) + { + if(&(A.derived()) != &mp_matrix) + { + mp_matrix.~KLUMatrixRef(); + ::new (&mp_matrix) KLUMatrixRef(A); + } + } + + // cached data to reduce reallocation, etc. 
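+  // (m_l/m_u are meant to hold the L and U factors and m_p/m_q the row/column
+  //  permutations once extractData() is implemented; they are mutable so that
+  //  the const accessors above can fill them lazily.)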
+ mutable LUMatrixType m_l; + + mutable LUMatrixType m_u; + mutable IntColVectorType m_p; + mutable IntRowVectorType m_q; + + KLUMatrixType m_dummy; + KLUMatrixRef mp_matrix; + + klu_numeric* m_numeric; + klu_symbolic* m_symbolic; + klu_common m_common; + mutable ComputationInfo m_info; + int m_factorizationIsOk; + int m_analysisIsOk; + mutable bool m_extractedDataAreDirty; + + private: + KLU(const KLU& ) { } +}; + + +template +void KLU::extractData() const +{ + if (m_extractedDataAreDirty) + { + eigen_assert(false && "KLU: extractData Not Yet Implemented"); + +// // get size of the data +// int lnz, unz, rows, cols, nz_udiag; +// umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); +// +// // allocate data +// m_l.resize(rows,(std::min)(rows,cols)); +// m_l.resizeNonZeros(lnz); +// +// m_u.resize((std::min)(rows,cols),cols); +// m_u.resizeNonZeros(unz); +// +// m_p.resize(rows); +// m_q.resize(cols); +// +// // extract +// umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(), +// m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(), +// m_p.data(), m_q.data(), 0, 0, 0, m_numeric); +// +// m_extractedDataAreDirty = false; + } +} + +template +typename KLU::Scalar KLU::determinant() const +{ + eigen_assert(false && "KLU: extractData Not Yet Implemented"); + return Scalar(); +} + +template +template +bool KLU::_solve_impl(const MatrixBase &b, MatrixBase &x) const +{ + Index rhsCols = b.cols(); + eigen_assert((BDerived::Flags&RowMajorBit)==0 && "KLU backend does not support non col-major rhs yet"); + eigen_assert((XDerived::Flags&RowMajorBit)==0 && "KLU backend does not support non col-major result yet"); + eigen_assert(b.derived().data() != x.derived().data() && " KLU does not support inplace solve"); + eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()"); + + x = b; + int info = 0; + if (true/*(MatrixType::Flags&RowMajorBit) == 0*/) + { + info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); + } + else + { + info = klu_tsolve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); + } + + m_info = info!=0 ? 
Success : NumericalIssue; + return true; +} + +} // end namespace Eigen + +#endif // EIGEN_KLUSUPPORT_H From a020d9b134ed804574845c0949c3b988ed56bcb5 Mon Sep 17 00:00:00 2001 From: Justin Carpentier Date: Tue, 17 Oct 2017 21:51:27 +0200 Subject: [PATCH 137/680] Use col method for column-major matrix --- Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index facdaf890..f66c846ef 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -168,7 +168,7 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> { for(Index j=0; jRealScalar(0)) m_invdiag(j) = RealScalar(1)/sum; else From 5fa79f96b8df069a5babb4e022fa43529062686f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Nov 2017 10:58:38 +0100 Subject: [PATCH 138/680] Patch from Konstantin Arturov to enable MKL's direct call by default --- Eigen/src/Core/util/MKL_support.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index b7d6ecc76..014ba1e3e 100755 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -55,7 +55,11 @@ #if defined EIGEN_USE_MKL -# include +# ifndef MKL_DIRECT_CALL +# define MKL_DIRECT_CALL +# define MKL_DIRECT_CALL_JUST_SET +# endif +# include /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ # ifndef INTEL_MKL_VERSION # undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */ @@ -69,6 +73,9 @@ # undef EIGEN_USE_MKL_VML # undef EIGEN_USE_LAPACKE_STRICT # undef EIGEN_USE_LAPACKE +# ifdef MKL_DIRECT_CALL_JUST_SET +# undef MKL_DIRECT_CALL +# endif # endif #endif From f86bb89d39e6611806c3b31719509e661120f3a1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 9 Nov 2017 11:07:45 +0100 Subject: [PATCH 139/680] Add EIGEN_MKL_NO_DIRECT_CALL option --- Eigen/src/Core/util/MKL_support.h | 2 +- doc/UsingIntelMKL.dox | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h index 014ba1e3e..17963fad4 100755 --- a/Eigen/src/Core/util/MKL_support.h +++ b/Eigen/src/Core/util/MKL_support.h @@ -55,7 +55,7 @@ #if defined EIGEN_USE_MKL -# ifndef MKL_DIRECT_CALL +# if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL) # define MKL_DIRECT_CALL # define MKL_DIRECT_CALL_JUST_SET # endif diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox index 6de14afad..fc35c3cf0 100644 --- a/doc/UsingIntelMKL.dox +++ b/doc/UsingIntelMKL.dox @@ -63,7 +63,11 @@ In addition you can choose which parts will be substituted by defining one or mu \c EIGEN_USE_MKL_ALL Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML -The options can be combined with \b MKL_DIRECT_CALL to enable MKL direct call feature. This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices. To make it work properly, the macro \c EIGEN_USE_MKL must also be defined in the case none of the other \c EIGEN_USE_MKL_* macros has been defined. 
+The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE* macros can be combined with \c EIGEN_USE_MKL to explicitly tell Eigen that the underlying BLAS/Lapack implementation is Intel MKL. +The main effect is to enable MKL direct call feature (\c MKL_DIRECT_CALL). +This may help to increase performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices. +MKL direct call can be disabled by defining \c EIGEN_MKL_NO_DIRECT_CALL. + Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details. From 0a1cc7394226c7439b586f5bac3e94cf287622f1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 10 Nov 2017 10:25:41 +0100 Subject: [PATCH 140/680] bug #1484: restore deleted line for 128 bits long doubles, and improve dispatching logic. --- .../src/MatrixFunctions/MatrixExponential.h | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index bb6d9e1fe..85ab3d97c 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -326,6 +326,7 @@ struct matrix_exp_computeUV } else if (l1norm < 1.125358383453143065081397882891878e+000L) { matrix_exp_pade13(arg, U, V); } else { + const long double maxnorm = 2.884233277829519311757165057717815L; frexp(l1norm / maxnorm, &squarings); if (squarings < 0) squarings = 0; MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp(squarings)); @@ -342,6 +343,27 @@ struct matrix_exp_computeUV } }; +template struct is_exp_known_type : false_type {}; +template<> struct is_exp_known_type : true_type {}; +template<> struct is_exp_known_type : true_type {}; +#if LDBL_MANT_DIG <= 112 +template<> struct is_exp_known_type : true_type {}; +#endif + +template +void matrix_exp_compute(const ArgType& arg, ResultType &result, true_type) // natively supported scalar type +{ + typedef typename ArgType::PlainObject MatrixType; + MatrixType U, V; + int squarings; + matrix_exp_computeUV::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) + MatrixType numer = U + V; + MatrixType denom = -U + V; + result = denom.partialPivLu().solve(numer); + for (int i=0; i * \param result variable in which result will be stored */ template -void matrix_exp_compute(const ArgType& arg, ResultType &result) +void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // default { typedef typename ArgType::PlainObject MatrixType; -#if LDBL_MANT_DIG > 112 // rarely happens typedef typename traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef typename std::complex ComplexScalar; - if (sizeof(RealScalar) > 14) { - result = arg.matrixFunction(internal::stem_function_exp); - return; - } -#endif - MatrixType U, V; - int squarings; - matrix_exp_computeUV::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) - MatrixType numer = U + V; - MatrixType denom = -U + V; - result = denom.partialPivLu().solve(numer); - for (int i=0; i); } } // end namespace Eigen::internal @@ -402,7 +411,7 @@ template struct MatrixExponentialReturnValue inline void evalTo(ResultType& result) const { const typename internal::nested_eval::type tmp(m_src); - internal::matrix_exp_compute(tmp, result); + internal::matrix_exp_compute(tmp, result, internal::is_exp_known_type()); } Index 
rows() const { return m_src.rows(); } From 1b2dcf9a4742644e86a3da10d8b027c841ae9861 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 10 Nov 2017 10:26:09 +0100 Subject: [PATCH 141/680] Check that Schur decomposition succeed. --- unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index 3f7d77710..ef50c46a9 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -428,7 +428,8 @@ struct matrix_function_compute typedef internal::traits Traits; // compute Schur decomposition of A - const ComplexSchur schurOfA(A); + const ComplexSchur schurOfA(A); + eigen_assert(schurOfA.info()==Success); MatrixType T = schurOfA.matrixT(); MatrixType U = schurOfA.matrixU(); From 6365f937d61ea0d1d7d7d21984f328a5965f2e92 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 10 Nov 2017 13:58:52 +0100 Subject: [PATCH 142/680] KLU depends on BTF but not on libSuiteSparse nor Cholmod --- cmake/FindKLU.cmake | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cmake/FindKLU.cmake b/cmake/FindKLU.cmake index 2783b63d2..4a8f8e0b0 100644 --- a/cmake/FindKLU.cmake +++ b/cmake/FindKLU.cmake @@ -16,6 +16,8 @@ find_path(KLU_INCLUDES ufsparse ) +find_library(KLU_LIBRARIES klu PATHS $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(KLU_LIBRARIES) if(NOT KLU_LIBDIR) @@ -32,14 +34,9 @@ if(KLU_LIBRARIES) set(KLU_LIBRARIES ${KLU_LIBRARIES} ${AMD_LIBRARY}) endif () - find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) - if(SUITESPARSE_LIBRARY) - set(KLU_LIBRARIES ${KLU_LIBRARIES} ${SUITESPARSE_LIBRARY}) - endif () - - find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) - if(CHOLMOD_LIBRARY) - set(KLU_LIBRARIES ${KLU_LIBRARIES} ${CHOLMOD_LIBRARY}) + find_library(BTF_LIBRARY btf PATHS $ENV{KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(BTF_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${BTF_LIBRARY}) endif() endif(KLU_LIBRARIES) @@ -48,4 +45,4 @@ include(FindPackageHandleStandardArgs) find_package_handle_standard_args(KLU DEFAULT_MSG KLU_INCLUDES KLU_LIBRARIES) -mark_as_advanced(KLU_INCLUDES KLU_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY CHOLMOD_LIBRARY SUITESPARSE_LIBRARY) +mark_as_advanced(KLU_INCLUDES KLU_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY BTF_LIBRARY) From b82cd93c01f60917c180d81d6321777f92e78464 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 10 Nov 2017 14:09:01 +0100 Subject: [PATCH 143/680] KLU: truely disable unimplemented code, add proper static assertions in solve --- unsupported/Eigen/src/KLUSupport/KLUSupport.h | 84 +++++++++---------- 1 file changed, 39 insertions(+), 45 deletions(-) diff --git a/unsupported/Eigen/src/KLUSupport/KLUSupport.h b/unsupported/Eigen/src/KLUSupport/KLUSupport.h index d2781202e..a9e8633d9 100644 --- a/unsupported/Eigen/src/KLUSupport/KLUSupport.h +++ b/unsupported/Eigen/src/KLUSupport/KLUSupport.h @@ -27,24 +27,24 @@ namespace Eigen { * * \implsparsesolverconcept * - * \sa \ref TutorialSparseSolverConcept, class SparseLU + * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU */ -inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, double B [ ], klu_common *Common, double) { - return klu_solve(Symbolic, Numeric, ldim, nrhs, B, Common); +inline int 
klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) { + return klu_solve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), B, Common); } -inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, std::complexB[], klu_common *Common, std::complex) { - return klu_z_solve(Symbolic, Numeric, ldim, nrhs, &numext::real_ref(B[0]), Common); +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_solve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), &numext::real_ref(B[0]), Common); } -inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, double B[], klu_common *Common, double) { - return klu_tsolve(Symbolic, Numeric, ldim, nrhs, B, Common); +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) { + return klu_tsolve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), B, Common); } -inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, int ldim, int nrhs, std::complexB[], klu_common *Common, std::complex) { - return klu_z_tsolve(Symbolic, Numeric, ldim, nrhs, &numext::real_ref(B[0]), 0, Common); +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_tsolve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), &numext::real_ref(B[0]), 0, Common); } inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) { @@ -114,7 +114,7 @@ class KLU : public SparseSolverBase > eigen_assert(m_isInitialized && "Decomposition is not initialized."); return m_info; } - +#if 0 // not implemented yet inline const LUMatrixType& matrixL() const { if (m_extractedDataAreDirty) extractData(); @@ -138,7 +138,7 @@ class KLU : public SparseSolverBase > if (m_extractedDataAreDirty) extractData(); return m_q; } - +#endif /** Computes the sparse Cholesky decomposition of \a matrix * Note that the matrix should be column-major, and in compressed format for best performance. * \sa SparseMatrix::makeCompressed(). @@ -213,9 +213,11 @@ class KLU : public SparseSolverBase > template bool _solve_impl(const MatrixBase &b, MatrixBase &x) const; +#if 0 // not implemented yet Scalar determinant() const; void extractData() const; +#endif protected: @@ -275,11 +277,12 @@ class KLU : public SparseSolverBase > } // cached data to reduce reallocation, etc. 
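+  // (the factor/permutation members below serve only the not-yet-implemented
+  //  extractData()/determinant() path, hence they are guarded out with it)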
+#if 0 // not implemented yet mutable LUMatrixType m_l; - mutable LUMatrixType m_u; mutable IntColVectorType m_p; mutable IntRowVectorType m_q; +#endif KLUMatrixType m_dummy; KLUMatrixRef mp_matrix; @@ -296,7 +299,7 @@ class KLU : public SparseSolverBase > KLU(const KLU& ) { } }; - +#if 0 // not implemented yet template void KLU::extractData() const { @@ -304,26 +307,26 @@ void KLU::extractData() const { eigen_assert(false && "KLU: extractData Not Yet Implemented"); -// // get size of the data -// int lnz, unz, rows, cols, nz_udiag; -// umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); -// -// // allocate data -// m_l.resize(rows,(std::min)(rows,cols)); -// m_l.resizeNonZeros(lnz); -// -// m_u.resize((std::min)(rows,cols),cols); -// m_u.resizeNonZeros(unz); -// -// m_p.resize(rows); -// m_q.resize(cols); -// -// // extract -// umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(), -// m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(), -// m_p.data(), m_q.data(), 0, 0, 0, m_numeric); -// -// m_extractedDataAreDirty = false; + // get size of the data + int lnz, unz, rows, cols, nz_udiag; + umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); + + // allocate data + m_l.resize(rows,(std::min)(rows,cols)); + m_l.resizeNonZeros(lnz); + + m_u.resize((std::min)(rows,cols),cols); + m_u.resizeNonZeros(unz); + + m_p.resize(rows); + m_q.resize(cols); + + // extract + umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(), + m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(), + m_p.data(), m_q.data(), 0, 0, 0, m_numeric); + + m_extractedDataAreDirty = false; } } @@ -333,27 +336,18 @@ typename KLU::Scalar KLU::determinant() const eigen_assert(false && "KLU: extractData Not Yet Implemented"); return Scalar(); } +#endif template template bool KLU::_solve_impl(const MatrixBase &b, MatrixBase &x) const { Index rhsCols = b.cols(); - eigen_assert((BDerived::Flags&RowMajorBit)==0 && "KLU backend does not support non col-major rhs yet"); - eigen_assert((XDerived::Flags&RowMajorBit)==0 && "KLU backend does not support non col-major result yet"); - eigen_assert(b.derived().data() != x.derived().data() && " KLU does not support inplace solve"); + EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()"); x = b; - int info = 0; - if (true/*(MatrixType::Flags&RowMajorBit) == 0*/) - { - info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); - } - else - { - info = klu_tsolve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); - } + int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); m_info = info!=0 ? 
Success : NumericalIssue; + return true; +} + +} // end namespace Eigen + +#endif // EIGEN_KLUSUPPORT_H From 00bc67c374605c75d2cee4a0831885ac25e99b54 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 10 Nov 2017 14:11:22 +0100 Subject: [PATCH 144/680] Move KLU support to official --- {unsupported/Eigen => Eigen}/KLUSupport | 0 {unsupported/Eigen => Eigen}/src/KLUSupport/KLUSupport.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {unsupported/Eigen => Eigen}/KLUSupport (100%) rename {unsupported/Eigen => Eigen}/src/KLUSupport/KLUSupport.h (100%) diff --git a/unsupported/Eigen/KLUSupport b/Eigen/KLUSupport similarity index 100% rename from unsupported/Eigen/KLUSupport rename to Eigen/KLUSupport diff --git a/unsupported/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h similarity index 100% rename from unsupported/Eigen/src/KLUSupport/KLUSupport.h rename to Eigen/src/KLUSupport/KLUSupport.h From 103c0aa6ad83d641625d33cb9e1edade45dc6910 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 10 Nov 2017 14:13:29 +0100 Subject: [PATCH 145/680] Add KLU in the list of third-party sparse solvers --- doc/SparseLinearSystems.dox | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index fc33b93e7..38754e4af 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -70,6 +70,9 @@ They are summarized in the following tables: UmfPackLU\link UmfPackSupport_Module UmfPackSupport \endlinkDirect LU factorizationSquareFill-in reducing, Leverage fast dense algebra Requires the SuiteSparse package, \b GPL +KLU\link KLUSupport_Module KLUSupport \endlinkDirect LU factorizationSquareFill-in reducing, suited for circuit simulation + Requires the SuiteSparse package, \b GPL + SuperLU\link SuperLUSupport_Module SuperLUSupport \endlinkDirect LU factorizationSquareFill-in reducing, Leverage fast dense algebra Requires the SuperLU library, (BSD-like) From 7cc503f9f57f6d3524115b222b62e584a04ff3ef Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 15 Nov 2017 21:33:37 +0100 Subject: [PATCH 146/680] bug #1485: fix linking issue of non template functions --- Eigen/src/PaStiXSupport/PaStiXSupport.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index d2ebfd7bb..160d8a523 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -64,28 +64,28 @@ namespace internal typedef typename _MatrixType::StorageIndex StorageIndex; }; - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} 
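    // (note: these overloads simply dispatch to the precision-specific PaStiX
    //  entry points s_/d_/c_/z_pastix; marking them inline is what fixes the
    //  multiple-definition link errors this patch targets)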
d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast(vals), perm, invp, reinterpret_cast(x), nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} From 7f42a93349c7ed175fda5a684f603dfdcf6724e8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 15 Nov 2017 20:45:09 +0000 Subject: [PATCH 147/680] Merged in alainvaucher/eigen/find-module-imported-target (pull request PR-324) In the CMake find module, define the Eigen imported target as when installing with CMake * In the CMake find module, define the Eigen imported target * Add quotes to the imported location, in case there are spaces in the path. Approved-by: Alain Vaucher --- cmake/FindEigen3.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmake/FindEigen3.cmake b/cmake/FindEigen3.cmake index 9e9697860..657440ba5 100644 --- a/cmake/FindEigen3.cmake +++ b/cmake/FindEigen3.cmake @@ -10,6 +10,10 @@ # EIGEN3_INCLUDE_DIR - the eigen include directory # EIGEN3_VERSION - eigen version # +# and the following imported target: +# +# Eigen3::Eigen - The header-only Eigen library +# # This module reads hints about search locations from # the following environment variables: # @@ -95,3 +99,8 @@ else (EIGEN3_INCLUDE_DIR) endif(EIGEN3_INCLUDE_DIR) +if(EIGEN3_FOUND AND NOT TARGET Eigen3::Eigen) + add_library(Eigen3::Eigen INTERFACE IMPORTED) + set_target_properties(Eigen3::Eigen PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${EIGEN3_INCLUDE_DIR}") +endif() From 3f7fb5a6d63b0f0a278fa4d889f86d8001a5f45e Mon Sep 17 00:00:00 2001 From: Androbin42 Date: Mon, 12 Jun 2017 17:07:56 +0000 Subject: [PATCH 148/680] Make eigen_monitor_perf.sh more robust --- scripts/eigen_monitor_perf.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/eigen_monitor_perf.sh b/scripts/eigen_monitor_perf.sh index 39f8e7ecd..8f3425daf 100755 --- a/scripts/eigen_monitor_perf.sh +++ b/scripts/eigen_monitor_perf.sh @@ -12,9 +12,9 @@ export CXX_FLAGS="-mfma -w" #### BENCH_PATH=$EIGEN_SOURCE_PATH/bench/perf_monitoring/$PREFIX -PREVPATH=`pwd` -cd $EIGEN_SOURCE_PATH/bench/perf_monitoring && ./runall.sh "Haswell 2.6GHz, FMA, Apple's clang" $* -cd $PREVPATH +PREVPATH=$(pwd) +cd $EIGEN_SOURCE_PATH/bench/perf_monitoring && ./runall.sh "Haswell 2.6GHz, FMA, Apple's clang" "$@" +cd $PREVPATH || exit 1 ALLFILES="$BENCH_PATH/*.png $BENCH_PATH/*.html $BENCH_PATH/index.html $BENCH_PATH/s1.js $BENCH_PATH/s2.js" From 95ecb2b5d670f80133ffc3fb34215ab7e33288ac Mon Sep 17 00:00:00 2001 From: Androbin42 Date: Mon, 12 Jun 2017 17:11:06 +0000 
Subject: [PATCH 149/680] Make buildtests.in more robust --- scripts/buildtests.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/buildtests.in b/scripts/buildtests.in index 526d5b74b..ab9c18fb1 100755 --- a/scripts/buildtests.in +++ b/scripts/buildtests.in @@ -10,7 +10,7 @@ then fi TESTSLIST="@EIGEN_TESTS_LIST@" -targets_to_make=`echo "$TESTSLIST" | egrep "$1" | xargs echo` +targets_to_make=$(echo "$TESTSLIST" | grep -E "$1" | xargs echo) if [ -n "${EIGEN_MAKE_ARGS:+x}" ] then From d9d7bd6d6215039f25d443c53a8f95cd760352e8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 12 Jun 2017 22:25:02 +0200 Subject: [PATCH 150/680] Fix 1x1 case in Solve expression with EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION==RowMajor --- Eigen/src/Core/Solve.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h index 960a58597..a8daea511 100644 --- a/Eigen/src/Core/Solve.h +++ b/Eigen/src/Core/Solve.h @@ -34,12 +34,12 @@ template struct s template struct solve_traits { - typedef Matrix PlainObject; + RhsType::MaxColsAtCompileTime>::type PlainObject; }; template From e43d8fe9d778929051e740ef9a9143b6a2f26482 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 12 Jun 2017 22:26:26 +0200 Subject: [PATCH 151/680] Fix compilation of streaming nested Array, i.e., cout << Array> --- Eigen/src/Core/NumTraits.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index aebc0c259..92a9ae1ea 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -215,6 +215,8 @@ struct NumTraits > static inline RealScalar epsilon() { return NumTraits::epsilon(); } EIGEN_DEVICE_FUNC static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } + + static inline int digits10() { return NumTraits::digits10(); } }; template<> struct NumTraits From 9fbdf020597cd198e3686ca786172aec6f009db6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 12 Jun 2017 22:30:32 +0200 Subject: [PATCH 152/680] Enable Array(EigenBase<>) ctor for compatible scalar types only. This prevents nested arrays to look as being convertible from/to simple arrays. 
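For illustration, a minimal sketch of the guarded case (the typedef names here are hypothetical):

  typedef Eigen::Array<float, 2, 2> Inner;        // a plain fixed-size array
  typedef Eigen::Array<Inner, 3, 3> NestedArray;  // an array whose "scalars" are arrays
  NestedArray n;
  Eigen::Array<float, 3, 3> s;
  // With the enable_if below, constructing 's' from 'n' no longer matches the
  // EigenBase constructor: Inner is not convertible to float, so the mismatch
  // is rejected during overload resolution rather than deep inside Base.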
--- Eigen/src/Core/Array.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 0d34269fd..e10020d4f 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -231,10 +231,16 @@ class Array : Base(other) { } + private: + struct PrivateType {}; + public: + /** \sa MatrixBase::operator=(const EigenBase&) */ template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Array(const EigenBase &other) + EIGEN_STRONG_INLINE Array(const EigenBase &other, + typename internal::enable_if::value, + PrivateType>::type = PrivateType()) : Base(other.derived()) { } From 449936828c3d7f97b90a0cc3d25a1ec7eaec79ff Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 13 Jun 2017 12:54:57 -0700 Subject: [PATCH 153/680] Added missing __device__ qualifier --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index b9a125b42..f4ae3c3c5 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -54,7 +54,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* return __halves2half2(from[0], from[1]); } -template<> EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { +template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { return __halves2half2(from[0], from[0]); } From b240080e6443e4fb203ac1cba0ad5bd4fdad56b3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 15 Jun 2017 10:16:30 +0200 Subject: [PATCH 154/680] bug #1436: fix compilation of Jacobi rotations with ARM NEON, some specializations of internal::conj_helper were missing. --- Eigen/Core | 1 + Eigen/src/Core/arch/AVX/Complex.h | 36 ++---------------------- Eigen/src/Core/arch/AltiVec/Complex.h | 35 ++--------------------- Eigen/src/Core/arch/Default/ConjHelper.h | 29 +++++++++++++++++++ Eigen/src/Core/arch/NEON/Complex.h | 4 +++ Eigen/src/Core/arch/SSE/Complex.h | 36 ++---------------------- Eigen/src/Core/arch/ZVector/Complex.h | 3 ++ 7 files changed, 43 insertions(+), 101 deletions(-) create mode 100644 Eigen/src/Core/arch/Default/ConjHelper.h diff --git a/Eigen/Core b/Eigen/Core index d18835613..a16942e19 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -371,6 +371,7 @@ using std::ptrdiff_t; #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" +#include "src/Core/arch/Default/ConjHelper.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 99439c8aa..7fa61969d 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -204,23 +204,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const - { return Packet4cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const - { return Packet4cf(Eigen::internal::pmul(x.v, y)); } -}; 
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) { @@ -400,23 +384,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const - { return Packet2cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const - { return Packet2cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) { diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 67db2f8ee..3e665730c 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -224,23 +224,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -416,23 +400,8 @@ template<> struct conj_helper return pconj(internal::pmul(a, b)); } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h new file mode 100644 index 000000000..4cfe34e05 --- /dev/null +++ b/Eigen/src/Core/arch/Default/ConjHelper.h @@ -0,0 +1,29 @@ + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_ARCH_CONJ_HELPER_H +#define EIGEN_ARCH_CONJ_HELPER_H + +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template<> struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ + { return padd(c, pmul(x,y)); } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ + { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ + }; \ + \ + template<> struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ + { return padd(c, pmul(x,y)); } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ + { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ + }; + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 57e9b431f..ef50ba303 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -265,6 +265,8 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON @@ -456,6 +458,8 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 5607fe0ab..23e717f28 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -229,23 +229,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -430,23 +414,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index d39d2d105..1bfb73397 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ 
b/Eigen/src/Core/arch/ZVector/Complex.h @@ -336,6 +336,9 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec From 157040d44f5ecea286fecb37cf376cfb42e1ff65 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 20 Jun 2017 16:58:03 +0200 Subject: [PATCH 155/680] Make sure CMAKE_Fortran_COMPILER is set before checking for Fortran functions --- cmake/FindBLAS.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake index 9f74b07fe..e3395bc10 100644 --- a/cmake/FindBLAS.cmake +++ b/cmake/FindBLAS.cmake @@ -152,7 +152,7 @@ set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) # Check the language being used get_property( _LANGUAGES_ GLOBAL PROPERTY ENABLED_LANGUAGES ) -if( _LANGUAGES_ MATCHES Fortran ) +if( _LANGUAGES_ MATCHES Fortran AND CMAKE_Fortran_COMPILER) set( _CHECK_FORTRAN TRUE ) elseif( (_LANGUAGES_ MATCHES C) OR (_LANGUAGES_ MATCHES CXX) ) set( _CHECK_FORTRAN FALSE ) From b651ce0ffa1e5a7f0c6414689d0e1ebb4bd1621c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Jun 2017 09:58:28 +0200 Subject: [PATCH 156/680] Fix a gcc7 warning: Wint-in-bool-context --- Eigen/src/Core/AssignEvaluator.h | 4 +- Eigen/src/Core/util/StaticAssert.h | 116 ++++++++++++++--------------- 2 files changed, 60 insertions(+), 60 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index b0ec7b7ca..dbe435d86 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -39,7 +39,7 @@ public: enum { DstAlignment = DstEvaluator::Alignment, SrcAlignment = SrcEvaluator::Alignment, - DstHasDirectAccess = DstFlags & DirectAccessBit, + DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit, JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment) }; @@ -83,7 +83,7 @@ private: && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), - MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess + MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess) && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. 
*/ diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 983361a45..cb1678900 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -44,64 +44,64 @@ struct static_assertion { enum { - YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX, - YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES, - YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES, - THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, - OUT_OF_RANGE_ACCESS, - YOU_MADE_A_PROGRAMMING_MISTAKE, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, - EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, - YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR, - YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR, - UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC, - THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES, - FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED, - NUMERIC_TYPE_MUST_BE_REAL, - COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED, - WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED, - THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE, - INVALID_MATRIX_PRODUCT, - INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS, - INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION, - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY, - THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES, - THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES, - INVALID_MATRIX_TEMPLATE_PARAMETERS, - INVALID_MATRIXBASE_TEMPLATE_PARAMETERS, - BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER, - THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX, - THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES, - YOU_ALREADY_SPECIFIED_THIS_STRIDE, - INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION, - THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD, - PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1, - THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS, - YOU_CANNOT_MIX_ARRAYS_AND_MATRICES, - YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION, - THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY, - YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT, - THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS, - THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL, - THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES, - YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED, - YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED, - THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE, - THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, - OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, - IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, - THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, - MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, - THIS_TYPE_IS_NOT_SUPPORTED, - STORAGE_KIND_MUST_MATCH, - STORAGE_INDEX_MUST_MATCH, - CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY + YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1, + YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1, + 
YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1, + THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1, + OUT_OF_RANGE_ACCESS=1, + YOU_MADE_A_PROGRAMMING_MISTAKE=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1, + EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1, + YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1, + YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1, + UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1, + THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1, + FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1, + NUMERIC_TYPE_MUST_BE_REAL=1, + COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1, + WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1, + THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1, + INVALID_MATRIX_PRODUCT=1, + INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1, + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1, + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1, + THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1, + THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1, + INVALID_MATRIX_TEMPLATE_PARAMETERS=1, + INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1, + BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1, + THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1, + THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1, + YOU_ALREADY_SPECIFIED_THIS_STRIDE=1, + INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1, + THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1, + PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1, + THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1, + YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1, + YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1, + THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1, + YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1, + THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1, + THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1, + THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1, + YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1, + YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1, + THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1, + THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1, + OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1, + IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1, + STORAGE_LAYOUT_DOES_NOT_MATCH=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1, + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1, + THIS_TYPE_IS_NOT_SUPPORTED=1, + STORAGE_KIND_MUST_MATCH=1, + STORAGE_INDEX_MUST_MATCH=1, + CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1 }; }; From 561f77707525644a10cbd971ad14bf47aa0e93e0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 27 Jun 2017 12:05:17 +0200 Subject: [PATCH 157/680] Fix a gcc7 warning about bool * bool in abs2 default implementation. 
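This is the companion to the preceding commit: gcc 7's new -Wint-in-bool-context fires whenever the result of a multiplication (or a plain integer/enumerator) is used where a boolean is expected, and the generic abs2, defined as x*x, produces exactly that once instantiated for bool. The one-line overload added below avoids the multiplication altogether, which is safe because squaring is the identity on {0,1}. A minimal reproduction, using the illustrative name abs2_like rather than Eigen's actual function:

    #include <iostream>

    // Generic definition, fine for arithmetic types...
    template<typename T>
    T abs2_like(T x) { return x * x; }

    // ...but with T = bool, gcc 7 warns:
    //   '*' in boolean context, suggest '&&' instead [-Wint-in-bool-context]
    // (the bool operands are promoted to int and the int product is converted
    // back to bool). Squaring is the identity on {0,1}, so return x directly:
    inline bool abs2_like(bool x) { return x; }

    int main() {
      std::cout << abs2_like(3.0) << ' ' << abs2_like(true) << '\n';  // 9 1
    }

The same diagnostic is what motivated pinning every static-assert enumerator to 1 and wrapping the AssignEvaluator flag tests in explicit bool(...) casts in the previous commit.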
--- Eigen/src/Core/MathFunctions.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 0be4a25da..f5635c076 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1059,6 +1059,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); } +EIGEN_DEVICE_FUNC +inline bool abs2(bool x) { return x; } + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) From c010b173608550d515f0aafd95cf8488af546e1a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 27 Jun 2017 14:29:57 +0200 Subject: [PATCH 158/680] Fix warning --- test/integer_types.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integer_types.cpp b/test/integer_types.cpp index a21f73a81..25126315a 100644 --- a/test/integer_types.cpp +++ b/test/integer_types.cpp @@ -162,8 +162,8 @@ void test_integer_types() VERIFY_IS_EQUAL(internal::scalar_div_cost::value, 8); VERIFY_IS_EQUAL(internal::scalar_div_cost::value, 8); if(sizeof(long)>sizeof(int)) { - VERIFY(internal::scalar_div_cost::value > internal::scalar_div_cost::value); - VERIFY(internal::scalar_div_cost::value > internal::scalar_div_cost::value); + VERIFY(int(internal::scalar_div_cost::value) > int(internal::scalar_div_cost::value)); + VERIFY(int(internal::scalar_div_cost::value) > int(internal::scalar_div_cost::value)); } #endif } From 53725c10b80dabd2a536f66e854c50f892496946 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 28 Jun 2017 17:55:23 +0000 Subject: [PATCH 159/680] Merged in mehdi_goli/opencl/DataDependancy (pull request PR-10) DataDependancy * Wrapping data type to the pointer class for sycl in non-terminal nodes; not having that breaks Tensorflow Conv2d code. * Applying Ronnan's Comments. 
* Applying benoit's comments --- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 3 ++- .../CXX11/src/Tensor/TensorBroadcasting.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 3 ++- .../CXX11/src/Tensor/TensorConcatenation.h | 4 +++- .../CXX11/src/Tensor/TensorContraction.h | 4 +++- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 3 ++- .../CXX11/src/Tensor/TensorConvolution.h | 4 +++- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 7 ++++-- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 10 ++++---- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 18 +++++++++++--- .../Eigen/CXX11/src/Tensor/TensorFFT.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 4 +++- .../src/Tensor/TensorForwardDeclarations.h | 16 +++++++++++++ .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 9 ++++--- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorPatch.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorScan.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 24 +++++++++++++++---- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 4 ++++ .../CXX11/src/Tensor/TensorVolumePatch.h | 3 ++- 30 files changed, 118 insertions(+), 40 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 166be200c..027305586 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -34,6 +34,7 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const std::size_t NumDimensions = internal::traits::NumDimensions; static const int Layout = internal::traits::Layout; + typedef typename traits::PointerType PointerType; enum { Flags = 0 @@ -168,7 +169,7 @@ struct TensorEvaluator, Device> /// required by sycl in order to extract the accessor const TensorEvaluator& right_impl() const { return m_rightImpl; } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return m_leftImpl.data(); } private: TensorEvaluator m_leftImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 23a74460e..b6c93aff9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -31,6 +31,7 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -372,7 +373,7 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c46a778b5..21ffa2872 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -32,6 +32,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions - 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -264,7 +265,7 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { CoeffReturnType* result = const_cast(m_impl.data()); if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 2c7ba961c..363df876c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -37,6 +37,8 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; + typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template @@ -275,7 +277,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator& left_impl() const { return m_leftImpl; } /// required by sycl in order to extract the accessor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index bf4a476d9..d5ee0d036 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -104,6 +104,8 @@ struct traits > // From NumDims below. 
static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; static const int Layout = traits::Layout; + typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; enum { Flags = 0 @@ -609,7 +611,7 @@ struct TensorContractionEvaluatorBase return internal::ploadt(m_result + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { return m_result; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array, ContractDims>& eval_op_indices) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index b29968b63..f2743ffb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -32,6 +32,7 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; + typedef typename ::Eigen::internal::TypeConversion::PointerType>::type PointerType; }; template @@ -244,7 +245,7 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the sycl accessor const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 378f5cccb..543b4814b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -231,6 +231,8 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; enum { Flags = 0 @@ -465,7 +467,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } private: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 2e6021b1e..da88bcb3b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -300,7 +300,7 @@ struct TensorEvaluator::PointerType data() const { return m_buf; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { // Don't make a local copy of the kernel unless we have to (i.e. 
it's an diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 7e4c129bb..3fd997278 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -30,6 +30,7 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename traits::PointerType PointerType; }; template @@ -138,7 +139,7 @@ struct TensorEvaluator, Devi return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return m_result; } #ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } @@ -184,6 +185,8 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; + typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template @@ -297,7 +300,7 @@ struct TensorEvaluator::PointerType data() const { return m_result; } #ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 82dd1e640..d0c027890 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -32,6 +32,7 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename MakePointer_::Type PointerType; enum { Flags = 0 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 8516b37b3..fcf330b10 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -283,7 +283,7 @@ struct TensorEvaluator, Device> internal::unpacket_traits::size); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator& impl() const { return m_argImpl; } @@ -353,7 +353,7 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator & impl() const { return m_argImpl; } @@ -433,7 +433,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator& left_impl() const { return m_leftImpl; } /// required by sycl in order to extract the accessor @@ -533,7 +533,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator & arg1Impl() const { return m_arg1Impl; } @@ -625,7 +625,7 @@ struct TensorEvaluator 
.cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator & cond_impl() const { return m_condImpl; } /// required by sycl in order to extract the accessor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 85dfc7a69..744cf7cc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -38,7 +38,7 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 }; @@ -89,6 +89,7 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename ::Eigen::internal::TypeConversion::type PointerType; }; template @@ -161,7 +162,11 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename ::Eigen::internal::TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type PointerType; enum { Flags = 0 }; @@ -238,7 +243,11 @@ struct traits::type _Arg3Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - + typedef typename ::Eigen::internal::TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type PointerType; enum { Flags = 0 }; @@ -314,6 +323,9 @@ struct traits > typedef typename ElseXprType::Nested ElseNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, + typename traits::PointerType, + typename traits::PointerType>::type PointerType; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index f060191ab..10e0a8a6b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -71,6 +71,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename traits::PointerType PointerType; }; template @@ -234,7 +235,7 @@ struct TensorEvaluator, D if (line_len > 1) { const RealScalar pi_over_len(EIGEN_PI / line_len); const ComplexScalar pos_j_base = ComplexScalar( - std::cos(pi_over_len), std::sin(pi_over_len)); + std::cos(pi_over_len), std::sin(pi_over_len)); pos_j_base_powered[1] = pos_j_base; if (line_len > 2) { const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index abe85c860..c015ce196 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -38,6 +38,7 @@ struct traits > typedef typename remove_reference::type _Nested; static 
const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 @@ -143,7 +144,8 @@ struct TensorEvaluator, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Eigen::internal::traits::PointerType data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 2e638992a..7a34f9419 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -22,6 +22,22 @@ template struct MakePointer { typedef T* Type; typedef T& RefType; }; + +namespace internal{ +template struct Pointer_type_promotion { + static const bool val=false; +}; +template struct Pointer_type_promotion { + static const bool val = true; +}; +template struct TypeConversion; +#ifndef __SYCL_DEVICE_ONLY__ +template struct TypeConversion{ + typedef A* type; +}; +#endif +} + #if defined(EIGEN_USE_SYCL) namespace TensorSycl { namespace internal{ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 11ae21be9..fa269b8c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -156,7 +157,7 @@ struct TensorEvaluator, Device> TensorOpCost::MulCost()); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_argImpl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index e1d5541bc..c56e3648a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -38,6 +38,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -423,7 +424,7 @@ struct TensorEvaluator, Device> return packetWithPossibleZero(index); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index af6ecf5f4..6147fbdf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -31,6 +31,7 @@ struct traits > : public traits 
typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -213,7 +214,7 @@ struct TensorEvaluator, Device> compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index cd0109ef4..4e384f9b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -46,6 +46,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = traits::NumDimensions; static const int Layout = (traits::Layout == ColMajor) ? RowMajor : ColMajor; + typedef typename XprTraits::PointerType PointerType; }; template @@ -159,7 +160,7 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddd2ca18..329655817 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -31,6 +31,7 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -146,7 +147,7 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return const_cast(m_impl.data()); } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } @@ -214,6 +215,7 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -468,7 +470,7 @@ struct TensorEvaluator, Devi } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { Scalar* result = m_impl.data(); if (result) { Index offset = 0; @@ -633,6 +635,7 @@ struct traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -823,7 +826,7 @@ struct TensorEvaluator::PointerType data() const { return NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a8e255246..5956e513d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -31,6 +31,7 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const 
int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -198,7 +199,7 @@ struct TensorEvaluator, Device return cost; } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { return NULL; } /// used by sycl EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 6e8e7885b..9e0a20abf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -256,7 +257,7 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 7356334e1..69079805d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -44,6 +44,7 @@ namespace internal { typedef typename XprType::Nested Nested; static const int NumDimensions = XprTraits::NumDimensions - array_size::value; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; template struct MakePointer { // Intermediate typedef to workaround MSVC issue. 
@@ -677,7 +678,7 @@ struct TensorEvaluator, } } - EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } #if defined(EIGEN_USE_SYCL) const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index e430b0826..14a50a029 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -31,6 +31,7 @@ struct traits::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -222,7 +223,7 @@ struct TensorEvaluator, Device TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator & impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 8501466ce..2a85ed840 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -24,6 +24,7 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -175,7 +176,7 @@ struct TensorEvaluator, Device> { return internal::ploadt(m_output + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { return m_output; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index edc9dd3f3..0697fd1ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -185,7 +186,7 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } // required by sycl EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;} diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 2237140e7..a7eea99b6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -31,6 +31,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -222,7 +223,7 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC 
Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 3d6270614..7b8bd2df7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -34,12 +34,26 @@ struct MakeLocalPointer { namespace Eigen { template class TensorTupleReducerDeviceOp; template struct TensorEvaluator, SyclKernelDevice>; +namespace internal { + +#ifdef __SYCL_DEVICE_ONLY__ +template struct TypeConversion { + template + static typename MakeGlobalPointer::Type get_address_space_pointer(typename MakeGlobalPointer::Type p); + template + static typename MakeLocalPointer::Type get_address_space_pointer(typename MakeLocalPointer::Type p); + + template + static A* get_address_space_pointer(T* p); + typedef decltype(get_address_space_pointer(B())) type; +}; + +#endif +} namespace TensorSycl { namespace internal { template struct GenericKernelReducer; - - /// This struct is used for special expression nodes with no operations (for example assign and selectOP). struct NoOP; @@ -51,10 +65,10 @@ template struct GetType{ }; template struct ValueCondition { - static const size_t Res =X; + static constexpr size_t Res =X; }; -template struct ValueCondition { - static const size_t Res =Y; +template struct ValueCondition { + static constexpr size_t Res =Y; }; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index a1e944e59..006b37921 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -61,6 +61,7 @@ struct traits > typedef T& RefType; }; + typedef typename MakePointer::Type PointerType; }; @@ -81,6 +82,7 @@ struct traits > typedef T& RefType; }; + typedef typename MakePointer::Type PointerType; }; @@ -105,6 +107,7 @@ struct traits > typedef typename MakePointerT::RefType RefType; }; + typedef typename MakePointer::Type PointerType; }; template @@ -121,6 +124,7 @@ struct traits > Options = BaseTraits::Options, Flags = BaseTraits::Flags }; + typedef typename BaseTraits::PointerType PointerType; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index b0f970ae6..a47db0b39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -33,6 +33,7 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; }; template @@ -509,7 +510,7 @@ struct TensorEvaluator, D return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } From c92faf9d841deeecd7e87d9544fab949f7f59ed2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 6 Jul 2017 05:05:57 +0000 Subject: [PATCH 160/680] Merged in mehdi_goli/upstr_benoit/HiperbolicOP (pull request PR-13) Adding hyperbolic operations for sycl. * Adding hyperbolic operations. 
* Adding the hyperbolic operations for CPU as well. --- Eigen/src/Core/MathFunctions.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 0be4a25da..966a126c9 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1375,9 +1375,19 @@ T acos(const T &x) { return acos(x); } + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T acosh(const T &x) { + EIGEN_USING_STD_MATH(acosh); + return acosh(x); +} + #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); } EIGEN_ALWAYS_INLINE double acos(double x) { return cl::sycl::acos(x); } +EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } +EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) #ifdef __CUDACC__ @@ -1395,9 +1405,18 @@ T asin(const T &x) { return asin(x); } +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T asinh(const T &x) { + EIGEN_USING_STD_MATH(asinh); + return asinh(x); +} + #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); } EIGEN_ALWAYS_INLINE double asin(double x) { return cl::sycl::asin(x); } +EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } +EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) #ifdef __CUDACC__ @@ -1415,9 +1434,18 @@ T atan(const T &x) { return atan(x); } +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T atanh(const T &x) { + EIGEN_USING_STD_MATH(atanh); + return atanh(x); +} + #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); } EIGEN_ALWAYS_INLINE double atan(double x) { return cl::sycl::atan(x); } +EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); } +EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) #ifdef __CUDACC__ From 62b4634ebe7cd7d391e91be812e5c18418db705a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 6 Jul 2017 05:08:13 +0000 Subject: [PATCH 161/680] Merged in mehdi_goli/upstr_benoit/TensorSYCLImageVolumePatchFixed (pull request PR-14) Applying Benoit's comment for Fixing ImageVolumePatch. * Applying Benoit's comment for Fixing ImageVolumePatch. Fixing conflict on cmake file. * Fixing dealocation of the memory in ImagePatch test for SYCL. * Fixing the automerge issue. --- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 60 +++++++++++++------ .../src/Tensor/TensorSyclExprConstructor.h | 8 +-- .../CXX11/src/Tensor/TensorVolumePatch.h | 49 +++++++++++---- .../test/cxx11_tensor_image_patch_sycl.cpp | 4 +- .../test/cxx11_tensor_volume_patch_sycl.cpp | 4 +- 5 files changed, 86 insertions(+), 39 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index c56e3648a..3c6a2e091 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -27,6 +27,7 @@ namespace Eigen { * patch_cols, and 1 for all the additional dimensions. 
*/ namespace internal { + template struct traits > : public traits { @@ -71,12 +72,12 @@ class TensorImagePatchOp : public TensorBase, Device> RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + #ifdef __SYCL_DEVICE_ONLY__ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) + #else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) + #endif : m_impl(op.expression(), device) #ifdef EIGEN_USE_SYCL , m_op(op) @@ -429,6 +452,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } #ifdef EIGEN_USE_SYCL + // Required by SYCL in order to construct the expression tree on the device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } #endif @@ -511,10 +535,10 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; TensorEvaluator m_impl; - -#ifdef EIGEN_USE_SYCL - const XprType& m_op; -#endif + #ifdef EIGEN_USE_SYCL + // Required for SYCL in order to construct the expression tree on the device + XprType m_op; + #endif }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 24cc23f45..cbae4ea1d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -448,8 +448,8 @@ struct ExprConstructor, CVQu template \ ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_row_strides, funcD.m_col_strides,\ - funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ - funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_value, funcD.m_padding_type, funcD.m_padding_explicit){}\ + funcD.m_in_row_strides, funcD.m_in_col_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, funcD.m_padding_explicit, \ + funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_type, funcD.m_padding_value){}\ }; SYCLTENSORIMAGEPATCHOPEXPR(const) @@ -468,8 +468,8 @@ struct ExprConstructor &t)\ : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.m_patch_planes, funcD.m_patch_rows, funcD.m_patch_cols, funcD.m_plane_strides, funcD.m_row_strides, funcD.m_col_strides,\ funcD.m_in_plane_strides, funcD.m_in_row_strides, funcD.m_in_col_strides,funcD.m_plane_inflate_strides, funcD.m_row_inflate_strides, funcD.m_col_inflate_strides, \ - funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, funcD.m_padding_value,\ - funcD.m_padding_type, funcD.m_padding_explicit){\ + funcD.m_padding_explicit, funcD.m_padding_top_z, funcD.m_padding_bottom_z, funcD.m_padding_top, funcD.m_padding_bottom, funcD.m_padding_left, funcD.m_padding_right, \ + funcD.m_padding_type, funcD.m_padding_value ){\ }\ }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index a47db0b39..51c099591 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -22,6 +22,7 @@ namespace Eigen { * dimensions. 
*/ namespace internal { + template struct traits > : public traits { @@ -34,6 +35,7 @@ struct traits > : public traits static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; typedef typename XprTraits::PointerType PointerType; + }; template @@ -66,12 +68,12 @@ class TensorVolumePatchOp : public TensorBase, D const TensorEvaluator& impl() const { return m_impl; } #ifdef EIGEN_USE_SYCL + // Required by SYCL in order to construct the expression on the device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& xpr() const { return m_op; } #endif @@ -614,8 +635,10 @@ struct TensorEvaluator, D TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL +// Required by SYCL in order to construct the expression on the device XprType m_op; #endif + }; diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp index e5ca4e388..eea18ec70 100644 --- a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp +++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp @@ -13,7 +13,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_image_patchOP_sycl +#define EIGEN_TEST_FUNC cxx11_tensor_image_patch_sycl #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -1084,7 +1084,7 @@ test_patch_padding_same_sycl(sycl_device); test_patch_no_extra_dim_sycl(sycl_device); test_imagenet_patches_sycl(sycl_device); } -void test_cxx11_tensor_image_patchOP_sycl() +void test_cxx11_tensor_image_patch_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_tensor_image_patch_test_per_device(device)); diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp index ddc9e0d46..039715abc 100644 --- a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp +++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp @@ -13,7 +13,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_volume_patchOP_sycl +#define EIGEN_TEST_FUNC cxx11_tensor_volume_patch_sycl #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -214,7 +214,7 @@ std::cout << "Running on " << s.template get_info( test_single_voxel_patch_sycl(sycl_device); test_entire_volume_patch_sycl(sycl_device); } -void test_cxx11_tensor_volume_patchOP_sycl() +void test_cxx11_tensor_volume_patch_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device(device)); From dc524ac7166a603de0dac9e0cfd5aa055dd117ef Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 6 Jul 2017 21:11:15 -0700 Subject: [PATCH 162/680] Fixed compilation warning --- .../Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 9dcc9dab7..1264a0270 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -24,9 +24,9 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) - : num_threads_(num_threads), + : env_(env), + num_threads_(num_threads), allow_spinning_(allow_spinning), - env_(env), threads_(num_threads), 
queues_(num_threads), coprimes_(num_threads), From 6795512e5942b5fd1829f776fde6611a7405b5bf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 6 Jul 2017 21:12:45 -0700 Subject: [PATCH 163/680] Improved the randomness of the tensor random generator --- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 68 ++++++++----------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 1655a813e..f108c349f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -55,11 +55,11 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { // TODO: Unify with the implementation in the non blocking thread pool. uint64_t current = *state; // Update the internal state - *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + *state = current * 6364136223846793005ULL + (stream << 1 | 1); // Generate the random output (using the PCG-XSH-RS scheme) return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); } @@ -73,17 +73,17 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state) { - unsigned rnd = PCG_XSH_RS_generator(state); +T RandomToTypeUniform(uint64_t* state, uint64_t stream) { + unsigned rnd = PCG_XSH_RS_generator(state, stream); return static_cast<T>(rnd); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) { +Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) { Eigen::half result; // Generate 10 random bits for the mantissa - unsigned rnd = PCG_XSH_RS_generator(state); + unsigned rnd = PCG_XSH_RS_generator(state, stream); result.x = static_cast<uint16_t>(rnd & 0x3ffu); // Set the exponent result.x |= (static_cast<uint16_t>(15) << 10); @@ -93,14 +93,14 @@ Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) { template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform<float>(uint64_t* state) { +float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) { typedef union { uint32_t raw; float fp; } internal; internal result; // Generate 23 random bits for the mantissa - const unsigned rnd = PCG_XSH_RS_generator(state); + const unsigned rnd = PCG_XSH_RS_generator(state, stream); result.raw = rnd & 0x7fffffu; // Set the exponent result.raw |= (static_cast<uint32_t>(127) << 23); @@ -109,7 +109,7 @@ float RandomToTypeUniform<float>(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform<double>(uint64_t* state) { +double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) { typedef union { uint64_t raw; double dp; } internal; internal result; result.raw = 0; // Generate 52 random bits for the mantissa // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; + unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; // Then generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state); + unsigned rnd2 = PCG_XSH_RS_generator(state, stream); result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2; // Set the exponent result.raw |= (static_cast<uint64_t>(1023) << 52);
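// A minimal sketch of the multi-stream update used above (illustrative
// helper, not part of the patch): each distinct stream value selects a
// different odd increment, and an odd increment keeps the 64-bit LCG at
// full period, so every stream walks an independent sequence:
//
//   uint64_t pcg_step(uint64_t* state, uint64_t stream) {
//     const uint64_t current = *state;
//     *state = current * 6364136223846793005ULL + ((stream << 1) | 1);
//     return current;  // the output is then permuted by the XSH-RS scheme
//   }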
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); +std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), + RandomToTypeUniform(state, stream)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); +std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), + RandomToTypeUniform(state, stream)); } template class UniformRandomGenerator { @@ -155,9 +155,7 @@ template class UniformRandomGenerator { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeUniform(&local_state); - m_state = local_state; + T result = RandomToTypeUniform(&m_state, i); return result; } @@ -165,11 +163,9 @@ template class UniformRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform(&local_state); + values[j] = RandomToTypeUniform(&m_state, i); } - m_state = local_state; return internal::pload(values); } @@ -190,14 +186,14 @@ struct functor_traits > { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state) { +T RandomToTypeNormal(uint64_t* state, uint64_t stream) { // Use the ratio of uniform method to generate numbers following a normal // distribution. See for example Numerical Recipes chapter 7.3.9 for the // details. 
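// (Ratio-of-uniforms in brief: u is drawn uniformly on (0, 1) and v
// uniformly on a symmetric interval around zero; the quadratic test on q
// below implements the acceptance region, and each accepted pair yields
// the normal sample v/u.)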
T u, v, q; do { - u = RandomToTypeUniform(state); - v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); + u = RandomToTypeUniform(state, stream); + v = T(1.7156) * (RandomToTypeUniform(state, stream) - T(0.5)); const T x = u - T(0.449871); const T y = numext::abs(v) + T(0.386595); q = x*x + y * (T(0.196)*y - T(0.25472)*x); @@ -208,14 +204,14 @@ T RandomToTypeNormal(uint64_t* state) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); +std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), + RandomToTypeNormal(state, stream)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); +std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), + RandomToTypeNormal(state, stream)); } @@ -234,9 +230,7 @@ template class NormalRandomGenerator { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeNormal(&local_state); - m_state = local_state; + T result = RandomToTypeNormal(&m_state, i); return result; } @@ -244,11 +238,9 @@ template class NormalRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal(&local_state); + values[j] = RandomToTypeNormal(&m_state, i); } - m_state = local_state; return internal::pload(values); } From 9daed6795224ef93719db66b71098bb7ac1a30ec Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 7 Jul 2017 04:18:03 +0000 Subject: [PATCH 164/680] Merged in tntnatbry/eigen (pull request PR-319) Tensor Trace op --- unsupported/Eigen/CXX11/Tensor | 1 + unsupported/Eigen/CXX11/src/Tensor/README.md | 52 ++++ .../Eigen/CXX11/src/Tensor/TensorBase.h | 12 + .../src/Tensor/TensorForwardDeclarations.h | 1 + .../Eigen/CXX11/src/Tensor/TensorTrace.h | 288 ++++++++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_trace.cpp | 171 +++++++++++ 7 files changed, 526 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h create mode 100644 unsupported/test/cxx11_tensor_trace.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 5d71a9c25..d243fe035 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -141,6 +141,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGenerator.h" #include "src/Tensor/TensorAssign.h" #include "src/Tensor/TensorScan.h" +#include "src/Tensor/TensorTrace.h" #include "src/Tensor/TensorSycl.h" #include "src/Tensor/TensorExecutor.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index 8d989f13a..4423f81f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1168,6 +1168,58 @@ Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` in TensorFunctors.h for information on how to implement a reduction operator. +## Trace + +A *Trace* operation returns a tensor with fewer dimensions than the original +tensor. 
It returns a tensor whose elements are the sum of the elements of the +original tensor along the main diagonal for a list of specified dimensions, the +"trace dimensions". Similar to the ```Reduction Dimensions```, the trace dimensions +are passed as an input parameter to the operation, are of type ```<TensorType>::Dimensions```, +and must satisfy the same requirements as reduction dimensions. In addition, +the trace dimensions must all have the same size. + +Example: Trace along 2 dimensions. + + // Create a tensor of 3 dimensions + Eigen::Tensor<int, 3> a(2, 2, 3); + a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}}); + // Specify the dimensions along which the trace will be computed. + // In this example, the trace can only be computed along the dimensions + // with indices 0 and 1 + Eigen::array<int, 2> dims({0, 1}); + // The output tensor contains all but the trace dimensions. + Tensor<int, 1> a_trace = a.trace(dims); + cout << "a_trace:" << endl; + cout << a_trace << endl; + => + a_trace: + 11 + 13 + 15 + + +### <Operation> trace(const Dimensions& new_dims) +### <Operation> trace() + +As a special case, if no parameter is passed to the operation, trace is computed +along *all* dimensions of the input tensor. + +Example: Trace along all dimensions. + + // Create a tensor of 3 dimensions, with all dimensions having the same size. + Eigen::Tensor<int, 3> a(3, 3, 3); + a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, + {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}, + {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}); + // Result is a zero dimension tensor + Tensor<int, 0> a_trace = a.trace(); + cout << "a_trace:" << endl; + cout << a_trace << endl; + => + a_trace: + 42 + + ## Scan Operations A *Scan* operation returns a tensor with the same dimensions as the original diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 5b1235826..0d6331e9c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -671,6 +671,18 @@ class TensorBase<Derived, ReadOnlyAccessors> return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer); } + template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTraceOp<const Dims, const Derived> + trace(const Dims& dims) const { + return TensorTraceOp<const Dims, const Derived>(derived(), dims); + } + + const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived> + trace() const { + DimensionList<Index, NumDimensions> in_dims; + return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims); + } + template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorBroadcastingOp<const Broadcast, const Derived> broadcast(const Broadcast& broadcast) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 2e638992a..2950e6963 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -70,6 +70,7 @@ template<typename Strides, typename XprType> class TensorInflationOp; template<typename Generator, typename XprType> class TensorGeneratorOp; template<typename LhsXprType, typename RhsXprType> class TensorAssignOp; template<typename Op, typename XprType> class TensorScanOp; +template<typename Dims, typename XprType> class TensorTraceOp; template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h new file mode 100644 index 000000000..4b165399f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -0,0 +1,288 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gagan Goel +// Copyright (C) 2017 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0.
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H + +namespace Eigen { + +/** \class TensorTrace + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor Trace class. + * + * + */ + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorTraceOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorTraceOp type; +}; + +} // end namespace internal + + +template +class TensorTraceOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims) + : m_xpr(expr), m_dims(dims) { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Dims m_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorTraceOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumReducedDims = internal::array_size::value; + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device) + { + + EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + + const Dims& op_dims = op.dims(); + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op_dims[i] >= 0); + eigen_assert(op_dims[i] < NumInputDims); + m_reduced[op_dims[i]] = true; + } + + // All the dimensions should be distinct to compute the trace + int num_distinct_reduce_dims = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + ++num_distinct_reduce_dims; + } + } + + eigen_assert(num_distinct_reduce_dims == NumReducedDims); + + // Compute the dimensions of the result. 
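+ // For example (illustrative shape): tracing a (3, 5, 3) input over
+ // dims {0, 2} keeps only dimension 1, so the result is a rank-1 tensor
+ // of size 5; the two traced dimensions must agree in size (3 here).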
+ const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + int output_index = 0; + int reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedDims[reduced_index] = input_dims[i]; + if (reduced_index > 0) { + // All the trace dimensions must have the same size + eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]); + } + ++reduced_index; + } + else { + m_dimensions[output_index] = input_dims[i]; + ++output_index; + } + } + + if (NumReducedDims != 0) { + m_traceDim = m_reducedDims[0]; + } + + // Compute the output strides + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } + else { + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Compute the input strides + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } + else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + output_index = 0; + reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if(m_reduced[i]) { + m_reducedStrides[reduced_index] = input_strides[i]; + ++reduced_index; + } + else { + m_preservedStrides[output_index] = input_strides[i]; + ++output_index; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Initialize the result + CoeffReturnType result = internal::cast(0); + Index index_stride = 0; + for (int i = 0; i < NumReducedDims; ++i) { + index_stride += m_reducedStrides[i]; + } + + // If trace is requested along all dimensions, starting index would be 0 + Index cur_index = 0; + if (NumOutputDims != 0) + cur_index = firstInput(index); + for (Index i = 0; i < m_traceDim; ++i) { + result += m_impl.coeff(cur_index); + cur_index += index_stride; + } + + return result; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType result = internal::ploadt(values); + return result; + } + + protected: + // Given the output index, finds the first index in the input tensor used to compute the trace + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += 
index * m_preservedStrides[0]; + } + else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; + } + return startInput; + } + + Dimensions m_dimensions; + TensorEvaluator m_impl; + const Device& m_device; + array m_reduced; + array m_reducedDims; + // Initialize the size of the trace dimension + Index m_traceDim = 1; + array m_outputStrides; + array m_reducedStrides; + array m_preservedStrides; +}; + + +} // End namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e639e7056..22647cadd 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -227,6 +227,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_fft) ei_add_test(cxx11_tensor_ifft) ei_add_test(cxx11_tensor_scan) + ei_add_test(cxx11_tensor_trace) endif() diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp new file mode 100644 index 000000000..340d1211c --- /dev/null +++ b/unsupported/test/cxx11_tensor_trace.cpp @@ -0,0 +1,171 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gagan Goel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::array; + +template +static void test_0D_trace() { + Tensor tensor; + tensor.setRandom(); + array dims; + Tensor result = tensor.trace(dims); + VERIFY_IS_EQUAL(result(), tensor()); +} + + +template +static void test_all_dimensions_trace() { + Tensor tensor1(5, 5, 5); + tensor1.setRandom(); + Tensor result1 = tensor1.trace(); + VERIFY_IS_EQUAL(result1.rank(), 0); + float sum = 0.0f; + for (int i = 0; i < 5; ++i) { + sum += tensor1(i, i, i); + } + VERIFY_IS_EQUAL(result1(), sum); + + Tensor tensor2(7, 7, 7, 7, 7); + array dims({{2, 1, 0, 3, 4}}); + Tensor result2 = tensor2.trace(dims); + VERIFY_IS_EQUAL(result2.rank(), 0); + sum = 0.0f; + for (int i = 0; i < 7; ++i) { + sum += tensor2(i, i, i, i, i); + } + VERIFY_IS_EQUAL(result2(), sum); +} + + +template +static void test_simple_trace() { + Tensor tensor1(3, 5, 3); + tensor1.setRandom(); + array dims1({{0, 2}}); + Tensor result1 = tensor1.trace(dims1); + VERIFY_IS_EQUAL(result1.rank(), 1); + VERIFY_IS_EQUAL(result1.dimension(0), 5); + float sum = 0.0f; + for (int i = 0; i < 5; ++i) { + sum = 0.0f; + for (int j = 0; j < 3; ++j) { + sum += tensor1(j, i, j); + } + VERIFY_IS_EQUAL(result1(i), sum); + } + + Tensor tensor2(5, 5, 7, 7); + tensor2.setRandom(); + array dims2({{2, 3}}); + Tensor result2 = tensor2.trace(dims2); + VERIFY_IS_EQUAL(result2.rank(), 2); + VERIFY_IS_EQUAL(result2.dimension(0), 5); + VERIFY_IS_EQUAL(result2.dimension(1), 5); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + sum = 0.0f; + for (int k = 0; k < 7; ++k) { + sum += tensor2(i, j, k, k); + } + VERIFY_IS_EQUAL(result2(i, j), sum); + } + } + + array dims3({{1, 0}}); + Tensor result3 = tensor2.trace(dims3); + VERIFY_IS_EQUAL(result3.rank(), 2); + VERIFY_IS_EQUAL(result3.dimension(0), 7); + VERIFY_IS_EQUAL(result3.dimension(1), 7); + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 7; ++j) { + sum = 0.0f; + for (int k 
= 0; k < 5; ++k) { + sum += tensor2(k, k, i, j); + } + VERIFY_IS_EQUAL(result3(i, j), sum); + } + } + + Tensor tensor3(3, 7, 3, 7, 3); + tensor3.setRandom(); + array dims4({{0, 2, 4}}); + Tensor result4 = tensor3.trace(dims4); + VERIFY_IS_EQUAL(result4.rank(), 2); + VERIFY_IS_EQUAL(result4.dimension(0), 7); + VERIFY_IS_EQUAL(result4.dimension(1), 7); + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 7; ++j) { + sum = 0.0f; + for (int k = 0; k < 3; ++k) { + sum += tensor3(k, i, k, j, k); + } + VERIFY_IS_EQUAL(result4(i, j), sum); + } + } + + Tensor tensor4(3, 7, 4, 7, 5); + tensor4.setRandom(); + array dims5({{1, 3}}); + Tensor result5 = tensor4.trace(dims5); + VERIFY_IS_EQUAL(result5.rank(), 3); + VERIFY_IS_EQUAL(result5.dimension(0), 3); + VERIFY_IS_EQUAL(result5.dimension(1), 4); + VERIFY_IS_EQUAL(result5.dimension(2), 5); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 4; ++j) { + for (int k = 0; k < 5; ++k) { + sum = 0.0f; + for (int l = 0; l < 7; ++l) { + sum += tensor4(i, l, j, l, k); + } + VERIFY_IS_EQUAL(result5(i, j, k), sum); + } + } + } +} + + +template +static void test_trace_in_expr() { + Tensor tensor(2, 3, 5, 3); + tensor.setRandom(); + array dims({{1, 3}}); + Tensor result(2, 5); + result = result.constant(1.0f) - tensor.trace(dims); + VERIFY_IS_EQUAL(result.rank(), 2); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + float sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + sum = 0.0f; + for (int k = 0; k < 3; ++k) { + sum += tensor(i, k, j, k); + } + VERIFY_IS_EQUAL(result(i, j), 1.0f - sum); + } + } +} + + +void test_cxx11_tensor_trace() { + CALL_SUBTEST(test_0D_trace()); + CALL_SUBTEST(test_0D_trace()); + CALL_SUBTEST(test_all_dimensions_trace()); + CALL_SUBTEST(test_all_dimensions_trace()); + CALL_SUBTEST(test_simple_trace()); + CALL_SUBTEST(test_simple_trace()); + CALL_SUBTEST(test_trace_in_expr()); + CALL_SUBTEST(test_trace_in_expr()); +} From 5ac27d5b519d63abeadd2537e2c607271f47536f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 8 Jul 2017 21:58:44 -0700 Subject: [PATCH 165/680] Avoid relying on cxx11 features when possible. 
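In-class member initializers such as `Index m_traceDim = 1;` are a C++11 feature, so the initialization moves into the constructor's mem-initializer list instead. A minimal sketch of the pattern (illustrative struct, `Index` as in the evaluator; not the full class):

    // C++11 only: default member initializer
    struct Evaluator { Index m_traceDim = 1; };

    // Portable pre-C++11 equivalent
    struct Evaluator {
      Evaluator() : m_traceDim(1) {}
      Index m_traceDim;
    };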
--- unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 4b165399f..2b1968de1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -101,7 +101,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device) + : m_impl(op.expression(), device), m_traceDim(1), m_device(device) { EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -276,7 +276,7 @@ struct TensorEvaluator, Device> array m_reduced; array m_reducedDims; // Initialize the size of the trace dimension - Index m_traceDim = 1; + Index m_traceDim; array m_outputStrides; array m_reducedStrides; array m_preservedStrides; From 575cda76b33a0169d427885e800cd7e5777bb34c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 9 Jul 2017 11:39:01 -0700 Subject: [PATCH 166/680] Fixed syntax errors generated by xcode --- .../CXX11/src/Tensor/TensorConcatenation.h | 4 +-- .../CXX11/src/Tensor/TensorContraction.h | 2 +- .../CXX11/src/Tensor/TensorConvolution.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 4 +-- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 28 +++++++++---------- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 363df876c..a7c1380b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -37,8 +37,8 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; - typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, - typename traits::PointerType, typename traits::PointerType>::type PointerType; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d5ee0d036..e72ddb4a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -104,7 +104,7 @@ struct traits > // From NumDims below. 
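// (e.g. contracting a rank-3 tensor with a rank-2 tensor over one index
// pair yields rank 3 + 2 - 2*1 = 3; illustrative ranks, not from this file.)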
static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; static const int Layout = traits::Layout; - typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, + typedef typename conditional::val, typename traits::PointerType, typename traits::PointerType>::type PointerType; enum { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 543b4814b..6fa51fd64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -231,7 +231,7 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; - typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, + typedef typename conditional::val, typename traits::PointerType, typename traits::PointerType>::type PointerType; enum { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 3fd997278..0e4db46de 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -185,7 +185,7 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; - typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, + typedef typename conditional::val, typename traits::PointerType, typename traits::PointerType>::type PointerType; }; @@ -300,7 +300,7 @@ struct TensorEvaluator::PointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC typename internal::traits::PointerType data() const { return m_result; } #ifdef EIGEN_USE_SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 744cf7cc4..4b6540c07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -89,7 +89,7 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename ::Eigen::internal::TypeConversion::type PointerType; + typedef typename TypeConversion::type PointerType; }; template @@ -162,11 +162,11 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename ::Eigen::internal::TypeConversion::val, - typename traits::PointerType, - typename traits::PointerType>::type - >::type PointerType; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type PointerType; enum { Flags = 0 }; @@ -243,11 +243,11 @@ struct traits::type _Arg3Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename ::Eigen::internal::TypeConversion::val, - typename traits::PointerType, - typename traits::PointerType>::type - >::type PointerType; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type PointerType; enum { Flags = 0 }; @@ -323,9 +323,9 @@ struct traits > typedef 
typename ElseXprType::Nested ElseNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename conditional<::Eigen::internal::Pointer_type_promotion::val, - typename traits::PointerType, - typename traits::PointerType>::type PointerType; + typedef typename conditional::val, + typename traits::PointerType, + typename traits::PointerType>::type PointerType; }; template From f0b154a4b09914a9f11f5801220785f525217b9e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 10 Jul 2017 09:54:09 -0700 Subject: [PATCH 167/680] Code cleanup --- unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index f2743ffb1..182bef918 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -32,7 +32,7 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; - typedef typename ::Eigen::internal::TypeConversion::PointerType>::type PointerType; + typedef typename TypeConversion::PointerType>::type PointerType; }; template From 84d7be103a2d4234f4aab91a434da85fc39b09d6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 22 Jul 2017 03:19:34 +0000 Subject: [PATCH 168/680] Fixing Argmax that was breaking upstream TensorFlow. --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 8f76b8254..442639868 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -40,7 +40,7 @@ struct traits > : public traits< typedef traits XprTraits; typedef typename XprTraits::StorageKind StorageKind; typedef typename XprTraits::Index Index; - typedef typename XprType::Scalar Scalar; + typedef Index Scalar; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; @@ -58,7 +58,8 @@ class TensorTupleReducerDeviceOp : public TensorBase::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::CoeffReturnType TupleType; + typedef Index CoeffReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerDeviceOp(XprType expr, const Index return_dim, @@ -99,9 +100,9 @@ struct TensorEvaluator, Sy { typedef TensorTupleReducerDeviceOp XprType; typedef typename XprType::Index Index; - typedef typename XprType::Index Scalar; - typedef Index CoeffReturnType; - typedef typename XprType::CoeffReturnType TupleType; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::TupleType TupleType; typedef typename TensorEvaluator::Dimensions Dimensions; enum { From e1e71ca4e49a504f638ae9fe449174425f565196 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Sun, 6 Aug 2017 19:53:18 -0400 Subject: [PATCH 169/680] initial support for z14 --- Eigen/src/Core/arch/ZVector/PacketMath.h | 809 ++++++++++++++--------- 1 file changed, 514 insertions(+), 295 
deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 57b01fc63..2e5f5774d 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -41,9 +41,14 @@ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; +// Z14 has builtin support for float vectors +#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH < 14) typedef struct { Packet2d v4f[2]; } Packet4f; +#else +typedef __vector float Packet4f; +#endif typedef union { int32_t i[4]; @@ -51,11 +56,15 @@ typedef union { int64_t l[2]; uint64_t ul[2]; double d[2]; + float f[4]; Packet4i v4i; Packet4ui v4ui; Packet2l v2l; Packet2ul v2ul; Packet2d v2d; +#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH >= 14) + Packet4f v4f; +#endif } Packet; // We don't want to write the same code all the time, but we need to reuse the constants @@ -258,32 +267,6 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } -/* Helper function to simulate a vec_splat_packet4f - */ -template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) -{ - Packet4f splat; - switch (element) { - case 0: - splat.v4f[0] = vec_splat(from.v4f[0], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 1: - splat.v4f[0] = vec_splat(from.v4f[0], 1); - splat.v4f[1] = splat.v4f[0]; - break; - case 2: - splat.v4f[0] = vec_splat(from.v4f[1], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 3: - splat.v4f[0] = vec_splat(from.v4f[1], 1); - splat.v4f[1] = splat.v4f[0]; - break; - } - return splat; -} - template struct palign_impl { @@ -300,31 +283,6 @@ struct palign_impl } }; -/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double - */ -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - switch (Offset % 4) { - case 1: - first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); - first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); - break; - case 2: - first.v4f[0] = first.v4f[1]; - first.v4f[1] = second.v4f[0]; - break; - case 3: - first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); - first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); - break; - } - } -}; - - template struct palign_impl { @@ -344,16 +302,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) return vfrom->v4i; } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet4f vfrom; - vfrom.v4f[0] = vec_ld2f(&from[0]); - vfrom.v4f[1] = vec_ld2f(&from[2]); - return vfrom; -} - template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { // FIXME: No intrinsic yet @@ -372,15 +320,6 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f vto->v4i = from; } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - vec_st2f(from.v4f[0], &to[0]); - vec_st2f(from.v4f[1], &to[2]); -} - - template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { // FIXME: No intrinsic yet @@ -397,13 +336,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vec_splats(from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) -{ - Packet4f to; - to.v4f[0] = pset1(static_cast(from)); - to.v4f[1] = 
to.v4f[0]; - return to; -} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, @@ -416,17 +348,6 @@ pbroadcast4(const int *a, a3 = vec_splat(a3, 3); } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = vec_splat_packet4f<0>(a3); - a1 = vec_splat_packet4f<1>(a3); - a2 = vec_splat_packet4f<2>(a3); - a3 = vec_splat_packet4f<3>(a3); -} - template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) @@ -449,16 +370,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f return pload(ai); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); -} - template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -477,16 +388,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - pstore((float *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; -} - template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -496,160 +397,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] + b.v4f[0]; - c.v4f[1] = a.v4f[1] + b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] - b.v4f[0]; - c.v4f[1] = a.v4f[1] - b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] * b.v4f[0]; - c.v4f[1] = a.v4f[1] * b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] / b.v4f[0]; - c.v4f[1] = a.v4f[1] / b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) -{ - Packet4f c; - c.v4f[0] = -a.v4f[0]; - c.v4f[1] = -a.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4i pconj(const 
Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ - Packet4f res; - res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); - res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); - res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); - 
return res; -} -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_round(a.v4f[0]); - res.v4f[1] = vec_round(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_ceil(a.v4f[0]); - res.v4f[1] = vec_ceil(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_floor(a.v4f[0]); - res.v4f[1] = vec_floor(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } @@ -659,14 +452,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec_perm(p, p, p16uc_DUPLICATE32_HI); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - Packet4f p = pload(from); - p.v4f[1] = vec_splat(p.v4f[0], 1); - p.v4f[0] = vec_splat(p.v4f[0], 0); - return p; -} - template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p = pload(from); @@ -674,15 +459,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) @@ -695,23 +477,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - Packet4f rev; - rev.v4f[0] = preverse(a.v4f[1]); - rev.v4f[1] = preverse(a.v4f[0]); - return rev; -} - template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = pabs(a.v4f[0]); - res.v4f[1] = pabs(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -730,13 +497,6 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) sum = padd(a, b); return pfirst(sum); } -template<> EIGEN_STRONG_INLINE float 
predux(const Packet4f& a) -{ - Packet2d sum; - sum = padd(a.v4f[0], a.v4f[1]); - double first = predux(sum); - return static_cast(first); -} template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { @@ -777,21 +537,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) return sum; } -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - transpose.packet[2] = vecs[2]; - transpose.packet[3] = vecs[3]; - ptranspose(transpose); - - Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); - sum = padd(sum, transpose.packet[2]); - sum = padd(sum, transpose.packet[3]); - return sum; -} - // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) @@ -806,12 +551,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - // Return predux_mul of the subvectors product - return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); -} - // min template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { @@ -826,14 +565,6 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet2d b, res; - b = pmin(a.v4f[0], a.v4f[1]); - res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - // max template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { @@ -849,14 +580,6 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet2d b, res; - b = pmax(a.v4f[0], a.v4f[1]); - res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); @@ -877,6 +600,321 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = t1; } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + + +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +/* z13 has no vector float support so we emulate that with double + z14 has proper vector float support. 
+*/ +#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH < 14) +/* Helper function to simulate a vec_splat_packet4f + */ +template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} + +/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double + */ +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); + first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); + break; + case 2: + first.v4f[0] = first.v4f[1]; + first.v4f[1] = second.v4f[0]; + break; + case 3: + first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); + first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); + break; + } + } +}; + +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + vec_st2f(from.v4f[0], &to[0]); + vec_st2f(from.v4f[1], &to[2]); +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + Packet4f to; + to.v4f[0] = pset1(static_cast(from)); + to.v4f[1] = to.v4f[0]; + return to; +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat_packet4f<0>(a3); + a1 = vec_splat_packet4f<1>(a3); + a2 = vec_splat_packet4f<2>(a3); + a3 = vec_splat_packet4f<3>(a3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + pstore((float *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] + b.v4f[0]; + c.v4f[1] = a.v4f[1] + b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] - b.v4f[0]; + c.v4f[1] = a.v4f[1] - b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] * b.v4f[0]; + c.v4f[1] = a.v4f[1] * b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] / b.v4f[0]; + c.v4f[1] = a.v4f[1] / b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + Packet4f c; + c.v4f[0] = -a.v4f[0]; + c.v4f[1] = -a.v4f[1]; + return c; +} 
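+
+// The same two-halves pattern runs through all of these Packet4f routines:
+// without native float vectors on z13, a Packet4f is stored as a pair of
+// Packet2d halves (v4f[0] and v4f[1]), and each float operation is carried
+// out as the corresponding double operation on both halves.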
+
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from)
+{
+  Packet4f p = pload(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
+template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse(a.v4f[1]);
+  rev.v4f[1] = preverse(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd(a.v4f[0], a.v4f[1]);
+  double first = predux(sum);
+  return static_cast(first);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs)
+{
+  PacketBlock transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  transpose.packet[2] = vecs[2];
+  transpose.packet[3] = vecs[3];
+  ptranspose(transpose);
+
+  Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
+  sum = padd(sum, transpose.packet[2]);
+  sum = padd(sum, transpose.packet[3]);
+  return sum;
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a)
+{
+  // Return predux_mul of the subvectors product
+  return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a)
+{
+ 
Packet2d b, res; + b = pmin(a.v4f[0], a.v4f[1]); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet2d b, res; + b = pmax(a.v4f[0], a.v4f[1]); + res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one */ EIGEN_DEVICE_FUNC inline void @@ -915,12 +953,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3].v4f[1] = t3.packet[1]; } -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; @@ -931,13 +963,200 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo); return result; } +#else +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } + } +}; -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4f; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4f = from; +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + return vec_splats(from); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return pload(af); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + pstore((float*)af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4f 
pmul(const Packet4f& a, const Packet4f& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pconj  (const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd  (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
+template<> EIGEN_STRONG_INLINE Packet4f plset  (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin   (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax   (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand   (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por    (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor   (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil  (const Packet4f& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs   (const Packet4f& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); }
+template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from)
+{
+  Packet4f p = pload(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a)
+{
+  Packet4f b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = padd(a, b);
+  b   = vec_sld(sum, sum, 4);
+  sum = padd(sum, b);
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs)
+{
+  Packet4f v[4], sum[4];
+
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
+
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = padd(sum[0], sum[1]);
+  // Lines 2+3
+  sum[1] = padd(sum[2], sum[3]);
+  // Add the results
+  sum[0] = padd(sum[0], sum[1]);
+
+  return sum[0];
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a)
+{
+  return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8))));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float 
predux_min(const Packet4f& a) +{ + Packet4f b, res; + b = pmin(a, vec_sld(a, a, 8)); + res = pmin(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +// max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet4f b, res; + b = pmax(a, vec_sld(a, a, 8)); + res = pmax(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +#endif + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } + } // end namespace internal } // end namespace Eigen From b223918ea99dcff9f6a3f8d017e7bd79ff4a7db7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 14:12:47 +0200 Subject: [PATCH 170/680] Doc: warn about constness in LLT::solveInPlace --- Eigen/src/Cholesky/LLT.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 152837bfd..814174d47 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -489,6 +489,9 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * This version avoids a copy when the right hand side matrix b is not needed anymore. * + * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. + * This function will const_cast it, so constness isn't honored here. 
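+ *
+ * A minimal call-site sketch (assuming \c llt already holds the factorization of a matrix \c A):
+ * \code
+ * MatrixXd b = MatrixXd::Random(A.rows(), 2); // right-hand side, not needed afterwards
+ * llt.solveInPlace(b);                        // b is overwritten with the solution of A x = b
+ * \endcode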
+ * * \sa LLT::solve(), MatrixBase::llt() */ template From fc39d5954b72ca2307921beb8a784cd78c2a8d10 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 12:17:37 +0000 Subject: [PATCH 171/680] Merged in dtrebbien/eigen/patch-1 (pull request PR-312) Work around a compilation error seen with nvcc V8.0.61 --- Eigen/src/Core/products/TriangularMatrixMatrix.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 6ec5a8a0b..539b6c0c6 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + // To work around an "error: member reference base type 'Matrix<...> + // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is + // not a structure or union" compilation error in nvcc (tested V8.0.61), + // create a dummy internal::constructor_without_unaligned_array_assert + // object to pass to the Matrix constructor. + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); From bc91a2df8b9f1c5fa47bfeb9b03c2036890570b5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 15:10:42 +0200 Subject: [PATCH 172/680] bug #1461: fix compilation of Map::x() --- Eigen/src/Geometry/Quaternion.h | 29 +++++++++++++++++------------ test/geo_quaternion.cpp | 13 +++++++++++++ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index 3e5a9badb..c3fd8c3e0 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef typename internal::traits::Coefficients Coefficients; + typedef typename Coefficients::CoeffReturnType CoeffReturnType; + typedef typename internal::conditional::Flags&LvalueBit), + Scalar&, CoeffReturnType>::type NonConstCoeffReturnType; + + enum { Flags = Eigen::internal::traits::Flags }; @@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase /** \returns the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); } + EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); } /** \returns the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); } + EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); } /** \returns the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); } + EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); } /** \returns the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar w() const { return 
this->derived().coeffs().coeff(3); }
 
-  /** \returns a reference to the \c x coefficient */
-  EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
-  /** \returns a reference to the \c y coefficient */
-  EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
-  /** \returns a reference to the \c z coefficient */
-  EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
-  /** \returns a reference to the \c w coefficient */
-  EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
+  /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
+  /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
+  /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
+  /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
 
   /** \returns a read-only vector expression of the imaginary part (x,y,z) */
   EIGEN_DEVICE_FUNC inline const VectorBlock vec() const { return coeffs().template head<3>(); }
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index 96889e722..8ee8fdb27 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -231,6 +231,19 @@ template void mapQuaternion(void){
   VERIFY_IS_APPROX(mq3*mq2, q3*q2);
   VERIFY_IS_APPROX(mcq1*mq2, q1*q2);
   VERIFY_IS_APPROX(mcq3*mq2, q3*q2);
+
+  // Bug 1461, compilation issue with Map::w(), and other reference/constness checks:
+  VERIFY_IS_APPROX(mcq3.coeffs().x() + mcq3.coeffs().y() + mcq3.coeffs().z() + mcq3.coeffs().w(), mcq3.coeffs().sum());
+  VERIFY_IS_APPROX(mcq3.x() + mcq3.y() + mcq3.z() + mcq3.w(), mcq3.coeffs().sum());
+  mq3.w() = 1;
+  const Quaternionx& cq3(q3);
+  VERIFY( &cq3.x() == &q3.x() );
+  const MQuaternionUA& cmq3(mq3);
+  VERIFY( &cmq3.x() == &mq3.x() );
+  // FIXME the following should be ok. The problem is that currently the LValueBit flag
+  // is used to determine whether we can return a coeff by reference or not, which is not enough for Map.
+  //const MCQuaternionUA& cmcq3(mcq3);
+  //VERIFY( &cmcq3.x() == &mcq3.x() );
 }
 
 template void quaternionAlignment(void){

From bc4dae9aeb84cc3d3114ee496d55654cc7256584 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud 
Date: Tue, 22 Aug 2017 15:59:08 +0200
Subject: [PATCH 173/680] bug #1449: fix redux_3 unit test

---
 test/redux.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/redux.cpp b/test/redux.cpp
index 989e1057b..2bade3735 100644
--- a/test/redux.cpp
+++ b/test/redux.cpp
@@ -9,6 +9,8 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define TEST_ENABLE_TEMPORARY_TRACKING
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+// ^^ see bug 1449
 
 #include "main.h"
 
From 9deee79922c38415125e4d6c2cd34cd05bda7889 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud 
Date: Tue, 22 Aug 2017 16:48:07 +0200
Subject: [PATCH 174/680] bug #1457: add setUnit() methods for consistency.
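
A usage sketch of the new methods (illustrative only, mirroring the existing
Unit() factories documented in the quick reference update below):

    VectorXf v(4);
    v.setUnit(1);    // v is now (0, 1, 0, 0), i.e. Vector4f::UnitY()
    v.setUnit(8, 3); // resizes v to 8 and writes the 4th canonical basis vector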
--- Eigen/src/Core/CwiseNullaryOp.h | 36 +++++++++++++++++++++++++++++++++ Eigen/src/Core/MatrixBase.h | 2 ++ doc/QuickReference.dox | 10 ++++++++- test/nullary.cpp | 18 +++++++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 144608ec2..b1923da0f 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -861,6 +861,42 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } +/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector + * + * \param i index of the unique coefficient to be set to 1 + * + * \only_for_vectors + * + * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index newSize, Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i class MatrixBase Derived& setIdentity(); EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setUnit(Index i); + EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i); bool isIdentity(const RealScalar& prec = NumTraits::dummy_precision()) const; bool isDiagonal(const RealScalar& prec = NumTraits::dummy_precision()) const; diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox index 44f5410db..59d7d05e4 100644 --- a/doc/QuickReference.dox +++ b/doc/QuickReference.dox @@ -261,6 +261,8 @@ x.setIdentity(); Vector3f::UnitX() // 1 0 0 Vector3f::UnitY() // 0 1 0 Vector3f::UnitZ() // 0 0 1 +Vector4f::Unit(i) +x.setUnit(i); \endcode @@ -278,6 +280,7 @@ N/A VectorXf::Unit(size,i) +x.setUnit(size,i); VectorXf::Unit(4,1) == Vector4f(0,1,0,0) == Vector4f::UnitY() \endcode @@ -285,7 +288,12 @@ VectorXf::Unit(4,1) == Vector4f(0,1,0,0) - +Note that it is allowed to call any of the \c set* functions to a dynamic-sized vector or matrix without passing new sizes. 
+For instance: +\code +MatrixXi M(3,3); +M.setIdentity(); +\endcode \subsection QuickRef_Map Mapping external arrays diff --git a/test/nullary.cpp b/test/nullary.cpp index acd55506e..22ec92352 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -191,6 +191,24 @@ void testVectorType(const VectorType& base) } } } + + // test setUnit() + if(m.size()>0) + { + for(Index k=0; k<10; ++k) + { + Index i = internal::random(0,m.size()-1); + m.setUnit(i); + VERIFY_IS_APPROX( m, VectorType::Unit(m.size(), i) ); + } + if(VectorType::SizeAtCompileTime==Dynamic) + { + Index i = internal::random(0,2*m.size()-1); + m.setUnit(2*m.size(),i); + VERIFY_IS_APPROX( m, VectorType::Unit(m.size(),i) ); + } + } + } template From 600e52fc7f574504fa832d67c9d94c991e504bdc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 17:06:57 +0200 Subject: [PATCH 175/680] Add missing scalar conversion --- Eigen/src/Geometry/AngleAxis.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h index 0af3c1b08..83ee1be46 100644 --- a/Eigen/src/Geometry/AngleAxis.h +++ b/Eigen/src/Geometry/AngleAxis.h @@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis& AngleAxis::operator=(const Quaterni if (n != Scalar(0)) { m_angle = Scalar(2)*atan2(n, abs(q.w())); - if(q.w() < 0) + if(q.w() < Scalar(0)) n = -n; m_axis = q.vec() / n; } From 39864ebe1eb7c8028769cf5d8750faaabce22446 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 Aug 2017 17:18:43 +0200 Subject: [PATCH 176/680] bug #336: improve doc for PlainObjectBase::Map --- Eigen/src/Core/PlainObjectBase.h | 4 ++++ doc/snippets/Matrix_Map_stride.cpp | 7 +++++++ 2 files changed, 11 insertions(+) create mode 100644 doc/snippets/Matrix_Map_stride.cpp diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 77f4f6066..1dc7e223a 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * \a data pointers. * + * Here is an example using strides: + * \include Matrix_Map_stride.cpp + * Output: \verbinclude Matrix_Map_stride.out + * * \see class Map */ //@{ diff --git a/doc/snippets/Matrix_Map_stride.cpp b/doc/snippets/Matrix_Map_stride.cpp new file mode 100644 index 000000000..ae42a127a --- /dev/null +++ b/doc/snippets/Matrix_Map_stride.cpp @@ -0,0 +1,7 @@ +Matrix4i A; +A << 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16; + +std::cout << Matrix2i::Map(&A(1,1),Stride<8,2>()) << std::endl; From 12249849b5ef7ec0c64f74440690fb00708b8da6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Aug 2017 10:43:21 +0200 Subject: [PATCH 177/680] Make the threshold from gemm to coeff-based-product configurable, and add some explanations. --- Eigen/src/Core/GeneralProduct.h | 10 ++++++++++ Eigen/src/Core/products/GeneralMatrixMatrix.h | 12 +++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index dec24848d..483277fe6 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -18,6 +18,16 @@ enum { Small = 3 }; +// Define the threshold value to fallback from the generic matrix-matrix product +// implementation (heavy) to the lightweight coeff-based product one. 
+// See generic_product_impl
+// in products/GeneralMatrixMatrix.h for more details.
+// TODO This threshold should also be used in the compile-time selector below.
+#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
+// This default value has been obtained on a Haswell architecture.
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
+#endif
+
 namespace internal {
 
 template struct product_type_selector;
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 6440e1d09..ed4d3182b 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -427,7 +427,13 @@ struct generic_product_impl
   template
   static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
+    // to determine the following heuristic.
+    // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
+    // unless it has been specialized by the user or for a given architecture.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
+    // I'm not sure it is still required.
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
       lazyproduct::evalTo(dst, lhs, rhs);
     else
     {
@@ -439,7 +445,7 @@ struct generic_product_impl
   template
   static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
       lazyproduct::addTo(dst, lhs, rhs);
     else
       scaleAndAddTo(dst,lhs, rhs, Scalar(1));
@@ -448,7 +454,7 @@ struct generic_product_impl
   template
   static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
       lazyproduct::subTo(dst, lhs, rhs);
     else
       scaleAndAddTo(dst, lhs, rhs, Scalar(-1));

From 21633e585b61564159d9cfbfbbad9006b8a09d64 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud 
Date: Thu, 24 Aug 2017 11:06:47 +0200
Subject: [PATCH 178/680] bug #1462: remove all occurrences of the deprecated
 __CUDACC_VER__ macro by introducing EIGEN_CUDACC_VER

---
 Eigen/Core                                         | 10 +++++++++-
 Eigen/src/Core/arch/CUDA/Half.h                    | 12 ++++++------
 Eigen/src/Core/arch/CUDA/PacketMathHalf.h          |  2 +-
 Eigen/src/Core/util/Macros.h                       |  7 ++++---
 test/cuda_basic.cu                                 |  2 +-
 unsupported/test/cxx11_tensor_argmax_cuda.cu       |  2 +-
 unsupported/test/cxx11_tensor_cast_float16_cuda.cu |  2 +-
 unsupported/test/cxx11_tensor_complex_cuda.cu      |  2 +-
 .../test/cxx11_tensor_complex_cwise_ops_cuda.cu    |  2 +-
 unsupported/test/cxx11_tensor_contract_cuda.cu     |  2 +-
 unsupported/test/cxx11_tensor_cuda.cu              |  2 +-
 unsupported/test/cxx11_tensor_device.cu            |  2 +-
 unsupported/test/cxx11_tensor_of_float16_cuda.cu   |  2 +-
 unsupported/test/cxx11_tensor_random_cuda.cu       |  2 +-
 unsupported/test/cxx11_tensor_reduction_cuda.cu    |  2 +-
 unsupported/test/cxx11_tensor_scan_cuda.cu         |  2 +-
 16 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/Eigen/Core b/Eigen/Core
index f613ee9aa..f6fe4b0ec 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -22,6 +22,14 @@
 #define EIGEN_CUDA_ARCH __CUDA_ARCH__
 #endif
 
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_CUDACC_VER  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_CUDACC_VER __CUDACC_VER__
+#else
+#define EIGEN_CUDACC_VER 0
+#endif
+
 // Handle NVCC/CUDA/SYCL
 #if defined(EIGEN_CUDACC) || 
defined(__SYCL_DEVICE_ONLY__) // Do not try asserts on CUDA and SYCL! @@ -248,7 +256,7 @@ #if defined EIGEN_CUDACC #define EIGEN_VECTORIZE_CUDA #include - #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 + #if EIGEN_CUDACC_VER >= 70500 #define EIGEN_HAS_CUDA_FP16 #endif #endif diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index e99847e24..8cedd65ad 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -386,7 +386,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hexp(a)); #else return half(::expf(float(a))); @@ -396,7 +396,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return half(::hlog(a)); #else return half(::logf(float(a))); @@ -409,7 +409,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -431,14 +431,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -576,7 +576,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 4a730a0e0..ba6a7f920 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -284,7 +284,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { return __floats2half2_rn(r1, r2); } -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 
530 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 322e8d899..b63ea2697 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -413,7 +413,7 @@ // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 @@ -429,7 +429,7 @@ #if defined(EIGEN_CUDACC) // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) +#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) #define EIGEN_HAS_CONSTEXPR 1 #endif #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ @@ -837,7 +837,8 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) + // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index cb2e4167a..0ff13477d 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -20,7 +20,7 @@ #include #include -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 653443dc5..0dfd6cfe1 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -12,7 +12,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index 88c233994..83a740e7a 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu index 87cf28920..cbff5a9b2 100644 --- 
a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cuda.cu @@ -11,7 +11,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_complex #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu index 2baf5eaad..9133fce5a 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu @@ -11,7 +11,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index dd68430ce..0b2f3f0f4 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -14,7 +14,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 0ba9d52e9..ad8c9662f 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -12,7 +12,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index fde20ddf2..ae21f492a 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 908a5e5a9..0ba7657b8 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu index b3be199e1..94d5f4e5a 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_cuda.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index 6858b43a7..fd09d013b 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -12,7 +12,7 @@ #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if dEIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu 
b/unsupported/test/cxx11_tensor_scan_cuda.cu index 5f146f3c9..46571cfea 100644 --- a/unsupported/test/cxx11_tensor_scan_cuda.cu +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -13,7 +13,7 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 +#if EIGEN_CUDACC_VER >= 70500 #include #endif #include "main.h" From 304ef2957134be386e50592ad7120177c5f3a7c0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 24 Aug 2017 11:26:41 +0200 Subject: [PATCH 179/680] Handle min/max/inf/etc issue in cuda_fp16.h directly in test/main.h --- test/cuda_basic.cu | 3 --- test/main.h | 13 +++++++++++++ unsupported/test/cxx11_tensor_argmax_cuda.cu | 3 --- unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 3 --- unsupported/test/cxx11_tensor_complex_cuda.cu | 3 --- .../test/cxx11_tensor_complex_cwise_ops_cuda.cu | 3 --- unsupported/test/cxx11_tensor_contract_cuda.cu | 3 --- unsupported/test/cxx11_tensor_cuda.cu | 3 --- unsupported/test/cxx11_tensor_device.cu | 3 --- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 3 --- unsupported/test/cxx11_tensor_random_cuda.cu | 3 --- unsupported/test/cxx11_tensor_reduction_cuda.cu | 3 --- unsupported/test/cxx11_tensor_scan_cuda.cu | 3 --- 13 files changed, 13 insertions(+), 36 deletions(-) diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu index 0ff13477d..ce66c2c78 100644 --- a/test/cuda_basic.cu +++ b/test/cuda_basic.cu @@ -20,9 +20,6 @@ #include #include -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include "cuda_common.h" diff --git a/test/main.h b/test/main.h index bd5325196..429c44f81 100644 --- a/test/main.h +++ b/test/main.h @@ -50,6 +50,19 @@ #endif #endif +// Same for cuda_fp16.h +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +#define EIGEN_TEST_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) +#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__ +#else +#define EIGEN_TEST_CUDACC_VER 0 +#endif + +#if EIGEN_TEST_CUDACC_VER >= 70500 +#include +#endif + // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 0dfd6cfe1..3d73d491a 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -12,9 +12,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index 83a740e7a..816e03220 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -13,9 +13,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu index cbff5a9b2..a52350f85 100644 --- a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cuda.cu @@ -11,9 +11,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_complex #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git 
a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu index 9133fce5a..aac780905 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu @@ -11,9 +11,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index 0b2f3f0f4..e821ccf0c 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -14,9 +14,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index ad8c9662f..9584a539f 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -12,9 +12,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index ae21f492a..cbb43e210 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -13,9 +13,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 0ba7657b8..b3aab0b9d 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -13,9 +13,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu index 94d5f4e5a..fa1a46732 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_cuda.cu @@ -13,9 +13,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index fd09d013b..ec0669704 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -12,9 +12,6 @@ #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda #define EIGEN_USE_GPU -#if dEIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu index 46571cfea..de1c0ac95 100644 --- a/unsupported/test/cxx11_tensor_scan_cuda.cu +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -13,9 +13,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if EIGEN_CUDACC_VER >= 70500 -#include -#endif #include "main.h" #include From 6d991a9595ffe22a0924bb8140a37355aca4cccb Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Wed, 30 Aug 2017 13:26:30 -0400 Subject: [PATCH 180/680] bug #1464 : Fixes construction of EulerAngles from 3D vector expression. 
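
With this fix an EulerAngles object can be constructed directly from a 3D
vector expression, not only from a plain vector; e.g. (sketch matching the
new unit test below):

    EulerAnglesXYZd e(Vector3d::Ones()); // alpha = beta = gamma = 1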
--- unsupported/Eigen/src/EulerAngles/EulerAngles.h | 2 +- unsupported/test/EulerAngles.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h index a5d034d71..e43cdb7fb 100644 --- a/unsupported/Eigen/src/EulerAngles/EulerAngles.h +++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h @@ -341,7 +341,7 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d) // set from a vector of Euler angles template - struct eulerangles_assign_impl + struct eulerangles_assign_impl { typedef typename Other::Scalar Scalar; static void run(EulerAngles& e, const Other& vec) diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp index 79ee72847..500fb2d17 100644 --- a/unsupported/test/EulerAngles.cpp +++ b/unsupported/test/EulerAngles.cpp @@ -278,6 +278,9 @@ void test_EulerAngles() EulerAnglesXYZd onesEd(1, 1, 1); EulerAnglesXYZf onesEf = onesEd.cast(); VERIFY_IS_APPROX(onesEd, onesEf.cast()); + + // Simple Construction from Vector3 test + VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones())); CALL_SUBTEST_1( eulerangles_manual() ); CALL_SUBTEST_2( eulerangles_manual() ); From a4089991eb6bdb9e8ebfef93d81ca7b5e67ea77d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 31 Aug 2017 02:49:39 +0000 Subject: [PATCH 181/680] Added support for CUDA 9.0. --- Eigen/Core | 1 + Eigen/src/Core/arch/CUDA/Half.h | 58 ++++++++++------- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 3 +- test/half_float.cpp | 38 +++++------ .../CXX11/src/Tensor/TensorContractionCuda.h | 9 +++ .../CXX11/src/Tensor/TensorReductionCuda.h | 63 +++++++++++++------ unsupported/test/cxx11_tensor_argmax_cuda.cu | 6 ++ .../test/cxx11_tensor_cast_float16_cuda.cu | 6 ++ unsupported/test/cxx11_tensor_complex_cuda.cu | 6 ++ .../cxx11_tensor_complex_cwise_ops_cuda.cu | 6 ++ .../test/cxx11_tensor_contract_cuda.cu | 6 ++ unsupported/test/cxx11_tensor_cuda.cu | 6 ++ unsupported/test/cxx11_tensor_device.cu | 6 ++ .../test/cxx11_tensor_of_float16_cuda.cu | 6 ++ unsupported/test/cxx11_tensor_random_cuda.cu | 6 ++ .../test/cxx11_tensor_reduction_cuda.cu | 6 ++ unsupported/test/cxx11_tensor_scan_cuda.cu | 6 ++ 17 files changed, 175 insertions(+), 63 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index f6fe4b0ec..c66359b79 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -22,6 +22,7 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif +// Starting with CUDA 9 the composite __CUDACC_VER__ is not available. #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) #elif defined(__CUDACC_VER__) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 8cedd65ad..1c557767a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -50,38 +50,45 @@ struct half; namespace half_impl { #if !defined(EIGEN_HAS_CUDA_FP16) - -// Make our own __half definition that is similar to CUDA's. -struct __half { - EIGEN_DEVICE_FUNC __half() : x(0) {} - explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} +// Make our own __half_raw definition that is similar to CUDA's. 
+struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; - +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw +typedef __half __half_raw; #endif -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); -struct half_base : public __half { +struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} +#endif }; } // namespace half_impl // Class definition. struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) - typedef half_impl::__half __half; + #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) + typedef half_impl::__half_raw __half_raw; #endif EIGEN_DEVICE_FUNC half() {} - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} +#endif explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { // these in hardware. If we need more performance on older/other CPUs, they are // also possible to vectorize directly. 
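 //
 // (For reference: binary16 is 1 sign bit, 5 exponent bits with a bias of 15,
 //  and 10 mantissa bits, so for instance 0x3c00 encodes 1.0f and 0xc000
 //  encodes -2.0f; the tests below rely on this encoding.)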
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { - __half h; +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; h.x = x; return h; } @@ -280,12 +287,13 @@ union FP32 { float f; }; -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 - return __float2half(ff); + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; #elif defined(EIGEN_HAS_FP16_C) - __half h; + __half_raw h; h.x = _cvtss_sh(ff, 0); return h; @@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { const FP32 f16max = { (127 + 16) << 23 }; const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; - __half o; + __half_raw o; o.x = static_cast(0x0u); unsigned int sign = f.u & sign_mask; @@ -335,7 +343,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #endif } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); @@ -612,11 +620,15 @@ struct hash { // Add the missing shfl_xor intrinsic #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + #if EIGEN_CUDACC_VER < 90000 return static_cast(__shfl_xor(static_cast(var), laneMask, width)); + #else + return static_cast(__shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); + #endif } #endif -// ldg() has an overload for __half, but we also need one for Eigen::half. +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ba6a7f920..ce48e4b31 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { half2 result; - result.x = a.x & 0x7FFF7FFF; + unsigned temp = *(reinterpret_cast(&(a))); + *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; return result; } diff --git a/test/half_float.cpp b/test/half_float.cpp index 7b6208873..7734f82cc 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -20,7 +20,7 @@ using Eigen::half; void test_conversion() { - using Eigen::half_impl::__half; + using Eigen::half_impl::__half_raw; // Conversion from float. VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); @@ -37,9 +37,9 @@ void test_conversion() VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); // Verify round-to-nearest-even behavior. 
- float val1 = float(half(__half(0x3c00))); - float val2 = float(half(__half(0x3c01))); - float val3 = float(half(__half(0x3c02))); + float val1 = float(half(__half_raw(0x3c00))); + float val2 = float(half(__half_raw(0x3c01))); + float val3 = float(half(__half_raw(0x3c02))); VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00); VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02); @@ -55,21 +55,21 @@ void test_conversion() VERIFY_IS_EQUAL(half(true).x, 0x3c00); // Conversion to float. - VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f); - VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f); + VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f); + VERIFY_IS_EQUAL(float(half(__half_raw(0x3c00))), 1.0f); // Denormals. - VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f); + VERIFY_IS_APPROX(float(half(__half_raw(0x8001))), -5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half_raw(0x0001))), 5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half_raw(0x0002))), 1.19209e-07f); // NaNs and infinities. VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. VERIFY(!(numext::isnan)(float(half(0.0f)))); - VERIFY((numext::isinf)(float(half(__half(0xfc00))))); - VERIFY((numext::isnan)(float(half(__half(0xfc01))))); - VERIFY((numext::isinf)(float(half(__half(0x7c00))))); - VERIFY((numext::isnan)(float(half(__half(0x7c01))))); + VERIFY((numext::isinf)(float(half(__half_raw(0xfc00))))); + VERIFY((numext::isnan)(float(half(__half_raw(0xfc01))))); + VERIFY((numext::isinf)(float(half(__half_raw(0x7c00))))); + VERIFY((numext::isnan)(float(half(__half_raw(0x7c01))))); #if !EIGEN_COMP_MSVC // Visual Studio errors out on divisions by 0 @@ -79,12 +79,12 @@ void test_conversion() #endif // Exactly same checks as above, just directly on the half representation. - VERIFY(!(numext::isinf)(half(__half(0x7bff)))); - VERIFY(!(numext::isnan)(half(__half(0x0000)))); - VERIFY((numext::isinf)(half(__half(0xfc00)))); - VERIFY((numext::isnan)(half(__half(0xfc01)))); - VERIFY((numext::isinf)(half(__half(0x7c00)))); - VERIFY((numext::isnan)(half(__half(0x7c01)))); + VERIFY(!(numext::isinf)(half(__half_raw(0x7bff)))); + VERIFY(!(numext::isnan)(half(__half_raw(0x0000)))); + VERIFY((numext::isinf)(half(__half_raw(0xfc00)))); + VERIFY((numext::isnan)(half(__half_raw(0xfc01)))); + VERIFY((numext::isinf)(half(__half_raw(0x7c00)))); + VERIFY((numext::isnan)(half(__half_raw(0x7c01)))); #if !EIGEN_COMP_MSVC // Visual Studio errors out on divisions by 0 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 428b18499..903bc51cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, // the sum across all big k blocks of the product of little k block of index (x, y) // with block of index (y, z). To compute the final output, we need to reduce // the 8 threads over y by summation. 
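 // A butterfly xor-shuffle reduction folds 2^k lanes in k steps; as a sketch
 // (one 32-bit value per lane, details depending on how the lanes are strided):
 //
 //   v += shfl_xor(v, 4);  v += shfl_xor(v, 2);  v += shfl_xor(v, 1);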
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) +#else +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) +#endif #define reduceRow(i, mask) \ shuffleInc(i, 0, mask); \ @@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh x1 = rhs_pf0.x; x2 = rhs_pf0.z; } + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 x1 = __shfl_xor(x1, 4); x2 = __shfl_xor(x2, 4); + #else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); + #endif if((threadIdx.x%8) < 4) { rhs_pf0.y = x1; rhs_pf0.w = x2; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 974eb7deb..ebcbd6f41 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) else { assert(0 && "Wordsize not supported"); } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } // We extend atomicExch to support extra data types @@ -104,9 +104,9 @@ template <> __device__ inline void atomicReduce(float* output, float accum, SumReducer&) { #if EIGEN_CUDA_ARCH >= 300 atomicAdd(output, accum); -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } @@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { @@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num // Let the last block reset the semaphore atomicInc(semaphore, gridDim.x + 1); } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } @@ -223,12 +227,14 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; // Initialize the output value if it wasn't initialized by the ReductionInitKernel - if (gridDim.x == 1 && first_index == 0) { - if (num_coeffs % 2 != 0) { - half last = input.m_impl.coeff(num_coeffs-1); - *scratch = __halves2half2(last, reducer.initialize()); - } else { - *scratch = reducer.template initializePacket(); + if (gridDim.x == 1) { + if (first_index == 0) { + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket(); + } } __syncthreads(); } @@ -244,19 +250,25 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); + #else + int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize); + reducer.reducePacket(*(half2*)(&temp), &accum); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { atomicReduce(scratch, accum, reducer); } - __syncthreads(); - - if (gridDim.x == 1 && first_index == 0) { - half tmp = __low2half(*scratch); - reducer.reduce(__high2half(*scratch), &tmp); - *output = tmp; + if (gridDim.x == 1) { + __syncthreads(); + if (first_index == 0) { + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; + } } } @@ -425,7 +437,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { @@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } #ifdef EIGEN_HAS_CUDA_FP16 @@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); + #else + int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize); + int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize); + reducer.reducePacket(*(half2*)(&temp1), &reduced_val1); + reducer.reducePacket(*(half2*)(&temp2), &reduced_val2); + #endif } half val1 = __low2half(reduced_val1); diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 3d73d491a..0e8b8125d 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -15,6 +15,12 @@ #include "main.h" #include +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include +#endif + using Eigen::Tensor; template diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index 816e03220..dabf9e45f 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -16,6 +16,12 @@ #include "main.h" #include +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include +#endif + using Eigen::Tensor; void test_cuda_conversion() { diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu index a52350f85..d25e1bee1 100644 --- a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cuda.cu @@ -14,6 +14,12 @@ #include "main.h" #include +// The EIGEN_CUDACC_VER macro is provided by +// 
unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 void test_cuda_nullary() {
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
index aac780905..4f0f621b4 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -14,6 +14,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 template
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index e821ccf0c..c68287e34 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -17,6 +17,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 typedef Tensor::DimensionPair DimPair;
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 9584a539f..d9059a2dc 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -15,6 +15,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 void test_cuda_nullary() {
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index cbb43e210..d5bfeeb39 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 using Eigen::RowMajor;
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index b3aab0b9d..c9f3ae1ae 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 template
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index fa1a46732..9d08605fc 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 void test_cuda_random_uniform() {
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index ec0669704..d6ce04f1c 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -15,6 +15,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 template
 static void test_full_reductions() {
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
index de1c0ac95..e99724b91 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 typedef Tensor::DimensionPair DimPair;

From a991c80365372f4cba2caddbd2d4c39352144793 Mon Sep 17 00:00:00 2001
From: Jonas Harsch
Date: Fri, 1 Sep 2017 11:30:26 +0000
Subject: [PATCH 182/680] Added an example for a contraction to a scalar value,
 e.g. a double contraction of two second-order tensors, and how to extract the
 value of the result. I lost one day getting this done, so I think it will help
 others. I also added the Eigen:: prefix to IndexPair and array in the same
 example.

---
 unsupported/Eigen/CXX11/src/Tensor/README.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 4423f81f7..30d553af7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1016,13 +1016,20 @@ multidimensional case.
 a.setValues({{1, 2}, {4, 5}, {5, 6}});
 // Compute the traditional matrix product
-    array<IndexPair<int>, 1> product_dims = { IndexPair<int>(1, 0) };
+    Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) };
 Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
 // Compute the product of the transpose of the matrices
-    array<IndexPair<int>, 1> transpose_product_dims = { IndexPair<int>(0, 1) };
+    Eigen::array<Eigen::IndexPair<int>, 1> transpose_product_dims = { Eigen::IndexPair<int>(0, 1) };
 Eigen::Tensor<int, 2> AtBt = a.contract(b, transpose_product_dims);
-
+
+    // Contraction to a scalar value using a double contraction.
+    // The first coordinates of both tensors are contracted, as well as both second coordinates.
+    Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
+    Eigen::Tensor<int, 0> AdoublecontractedA = a.contract(a, double_contraction_product_dims);
+
+    // Extracting the scalar value of the tensor contraction for further usage
+    int value = AdoublecontractedA(0);

 ## Reduction Operations

From a34fb212cd7ea96524d859a0c7f989f8657bc068 Mon Sep 17 00:00:00 2001
From: Jonas Harsch
Date: Fri, 1 Sep 2017 12:01:39 +0000
Subject: [PATCH 183/680] Close branch JonasMu/added-an-example-for-a-contraction-to-a--1504265366851

From 80142362ac35ca77bfc5ccf7ba49c9f034b57abc Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Sat, 2 Sep 2017 22:50:20 +0200
Subject: [PATCH 184/680] Fix mixing types in sparse matrix products.
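A minimal sketch of what this enables (written against the post-patch API;
matrix sizes and values are arbitrary): a real sparse matrix times a complex
one now compiles, with the result scalar promoted to std::complex<double>, as
exercised by the new test_mixing_types test below.

    #include <complex>
    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseMatrix<double> A(2, 2);
      Eigen::SparseMatrix<std::complex<double> > B(2, 2);
      A.insert(0, 0) = 2.0;                             // real factor
      B.insert(0, 0) = std::complex<double>(1.0, -1.0); // complex factor
      A.makeCompressed(); B.makeCompressed();
      Eigen::SparseMatrix<std::complex<double> > C = A * B; // mixed-scalar product
      return C.coeff(0, 0) == std::complex<double>(2.0, -2.0) ? 0 : 1;
    }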
--- .../ConservativeSparseSparseProduct.h | 67 ++++++++------- .../SparseSparseProductWithPruning.h | 22 ++--- test/sparse_product.cpp | 84 +++++++++++++++++++ 3 files changed, 132 insertions(+), 41 deletions(-) diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 492eb0a29..9db119b67 100644 --- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -17,7 +17,9 @@ namespace internal { template static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false) { - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar LhsScalar; + typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all::type::Scalar ResScalar; // make sure to call innerSize/outerSize since we fake the storage order. Index rows = lhs.innerSize(); @@ -25,7 +27,7 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0); + ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0); ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0); std::memset(mask,0,sizeof(bool)*rows); @@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r Index nnz = 0; for (typename evaluator::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); if(!mask[i]) { mask[i] = true; @@ -166,11 +168,12 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix rhsRow = rhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); - res = resRow; + typedef SparseMatrix RowMajorRhs; + typedef SparseMatrix RowMajorRes; + RowMajorRhs rhsRow = rhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); + res = resRow; } }; @@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix lhsRow = lhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); + typedef SparseMatrix RowMajorLhs; + typedef SparseMatrix RowMajorRes; + RowMajorLhs lhsRow = lhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); res = resRow; } }; @@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix; - ColMajorMatrix lhsCol = lhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); + typedef SparseMatrix ColMajorLhs; + typedef SparseMatrix ColMajorRes; + ColMajorLhs lhsCol = lhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); res = resCol; } }; @@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix; - ColMajorMatrix rhsCol = rhs; - ColMajorMatrix 
resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); + typedef SparseMatrix ColMajorRhs; + typedef SparseMatrix ColMajorRes; + ColMajorRhs rhsCol = rhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); res = resCol; } }; @@ -263,7 +269,8 @@ namespace internal { template static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar LhsScalar; + typedef typename remove_all::type::Scalar RhsScalar; Index cols = rhs.outerSize(); eigen_assert(lhs.outerSize() == rhs.innerSize()); @@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, { for (typename evaluator::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); res.coeffRef(i,j) += x * y; } } @@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix; - ColMajorMatrix lhsCol(lhs); - internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); + typedef SparseMatrix ColMajorLhs; + ColMajorLhs lhsCol(lhs); + internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); } }; @@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix; - ColMajorMatrix rhsCol(rhs); - internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); + typedef SparseMatrix ColMajorRhs; + ColMajorRhs rhsCol(rhs); + internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); } }; diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 21c419002..88820a48f 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -21,7 +21,8 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res); - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all::type::Scalar ResScalar; typedef typename remove_all::type::StorageIndex StorageIndex; // make sure to call innerSize/outerSize since we fake the storage order. 
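(The hunks in this file feed expressions of the following shape -- an
illustrative sketch assuming the Eigen 3.3 sparse API, with arbitrary
reference and epsilon values -- where result entries of magnitude below
reference*epsilon are dropped:

    Eigen::SparseMatrix<std::complex<double> > C = (A * B).pruned(1.0, 1e-12);

With A real and B complex, the temporary accumulator now carries the promoted
complex scalar type rather than a single common Scalar, as exercised by the
.pruned() cases in the new test below.)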
@@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); // allocate a temporary buffer - AmbiVector tempVector(rows); + AmbiVector tempVector(rows); // mimics a resizeByInnerOuter: if(ResultType::IsRowMajor) @@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index()) tempVector.restart(); - Scalar x = rhsIt.value(); + RhsScalar x = rhsIt.value(); for (typename evaluator::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt) { tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x; } } res.startVec(j); - for (typename AmbiVector::Iterator it(tempVector,tolerance); it; ++it) + for (typename AmbiVector::Iterator it(tempVector,tolerance); it; ++it) res.insertBackByOuterInner(j,it.index()) = it.value(); } res.finalize(); @@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector; template struct sparse_sparse_product_with_pruning_selector { - typedef typename traits::type>::Scalar Scalar; typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) @@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs; - typedef SparseMatrix ColMajorMatrixRhs; + typedef SparseMatrix ColMajorMatrixLhs; + typedef SparseMatrix ColMajorMatrixRhs; ColMajorMatrixLhs colLhs(lhs); ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl(colLhs, colRhs, res, tolerance); @@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixLhs; + typedef SparseMatrix RowMajorMatrixLhs; RowMajorMatrixLhs rowLhs(lhs); sparse_sparse_product_with_pruning_selector(rowLhs,rhs,res,tolerance); } @@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixRhs; + typedef SparseMatrix RowMajorMatrixRhs; RowMajorMatrixRhs rowRhs(rhs); sparse_sparse_product_with_pruning_selector(lhs,rowRhs,res,tolerance); } @@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixRhs; + typedef SparseMatrix ColMajorMatrixRhs; ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl(lhs, colRhs, res, tolerance); } @@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs; + typedef SparseMatrix ColMajorMatrixLhs; ColMajorMatrixLhs colLhs(lhs); internal::sparse_sparse_product_with_pruning_impl(colLhs, rhs, res, tolerance); } diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp index 197586741..f47170b72 100644 --- a/test/sparse_product.cpp +++ b/test/sparse_product.cpp @@ -371,6 +371,88 @@ void bug_942() VERIFY_IS_APPROX( ( d.asDiagonal()*cmA ).eval().coeff(0,0), res ); } +template +void test_mixing_types() +{ + typedef std::complex Cplx; + typedef SparseMatrix SpMatReal; + typedef SparseMatrix SpMatCplx; + typedef SparseMatrix SpRowMatCplx; + typedef Matrix DenseMatReal; + typedef Matrix DenseMatCplx; + + Index n = internal::random(1,100); + double density = (std::max)(8./(n*n), 0.2); + + SpMatReal sR1(n,n); + SpMatCplx sC1(n,n), sC2(n,n), sC3(n,n); + SpRowMatCplx sCR(n,n); + DenseMatReal dR1(n,n); + DenseMatCplx dC1(n,n), dC2(n,n), dC3(n,n); + + initSparse(density, dR1, sR1); + initSparse(density, dC1, sC1); + initSparse(density, dC2, sC2); + + VERIFY_IS_APPROX( sC2 = (sR1 * sC1), dC3 = dR1.template cast() * dC1 ); + VERIFY_IS_APPROX( sC2 = (sC1 
* sR1), dC3 = dC1 * dR1.template cast() ); + VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1), dC3 = dR1.template cast().transpose() * dC1 ); + VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast() ); + VERIFY_IS_APPROX( sC2 = (sR1 * sC1.transpose()), dC3 = dR1.template cast() * dC1.transpose() ); + VERIFY_IS_APPROX( sC2 = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast().transpose() ); + VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast().transpose() ); + + VERIFY_IS_APPROX( sCR = (sR1 * sC1), dC3 = dR1.template cast() * dC1 ); + VERIFY_IS_APPROX( sCR = (sC1 * sR1), dC3 = dC1 * dR1.template cast() ); + VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1), dC3 = dR1.template cast().transpose() * dC1 ); + VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast() ); + VERIFY_IS_APPROX( sCR = (sR1 * sC1.transpose()), dC3 = dR1.template cast() * dC1.transpose() ); + VERIFY_IS_APPROX( sCR = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast().transpose() ); + VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast().transpose() ); + + + VERIFY_IS_APPROX( sC2 = (sR1 * sC1).pruned(), dC3 = dR1.template cast() * dC1 ); + VERIFY_IS_APPROX( sC2 = (sC1 * sR1).pruned(), dC3 = dC1 * dR1.template cast() ); + VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1).pruned(), dC3 = dR1.template cast().transpose() * dC1 ); + VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1).pruned(), dC3 = dC1.transpose() * dR1.template cast() ); + VERIFY_IS_APPROX( sC2 = (sR1 * sC1.transpose()).pruned(), dC3 = dR1.template cast() * dC1.transpose() ); + VERIFY_IS_APPROX( sC2 = (sC1 * sR1.transpose()).pruned(), dC3 = dC1 * dR1.template cast().transpose() ); + VERIFY_IS_APPROX( sC2 = (sR1.transpose() * sC1.transpose()).pruned(), dC3 = dR1.template cast().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( sC2 = (sC1.transpose() * sR1.transpose()).pruned(), dC3 = dC1.transpose() * dR1.template cast().transpose() ); + + VERIFY_IS_APPROX( sCR = (sR1 * sC1).pruned(), dC3 = dR1.template cast() * dC1 ); + VERIFY_IS_APPROX( sCR = (sC1 * sR1).pruned(), dC3 = dC1 * dR1.template cast() ); + VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1).pruned(), dC3 = dR1.template cast().transpose() * dC1 ); + VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1).pruned(), dC3 = dC1.transpose() * dR1.template cast() ); + VERIFY_IS_APPROX( sCR = (sR1 * sC1.transpose()).pruned(), dC3 = dR1.template cast() * dC1.transpose() ); + VERIFY_IS_APPROX( sCR = (sC1 * sR1.transpose()).pruned(), dC3 = dC1 * dR1.template cast().transpose() ); + VERIFY_IS_APPROX( sCR = (sR1.transpose() * sC1.transpose()).pruned(), dC3 = dR1.template cast().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( sCR = (sC1.transpose() * sR1.transpose()).pruned(), dC3 = dC1.transpose() * dR1.template cast().transpose() ); + + + VERIFY_IS_APPROX( dC2 = (sR1 * sC1), dC3 = dR1.template cast() * dC1 ); + VERIFY_IS_APPROX( dC2 = (sC1 * sR1), dC3 = dC1 * dR1.template cast() ); + VERIFY_IS_APPROX( dC2 = (sR1.transpose() * sC1), dC3 = dR1.template cast().transpose() * dC1 ); + VERIFY_IS_APPROX( dC2 = (sC1.transpose() * sR1), dC3 = dC1.transpose() * dR1.template cast() ); + VERIFY_IS_APPROX( dC2 
= (sR1 * sC1.transpose()), dC3 = dR1.template cast() * dC1.transpose() ); + VERIFY_IS_APPROX( dC2 = (sC1 * sR1.transpose()), dC3 = dC1 * dR1.template cast().transpose() ); + VERIFY_IS_APPROX( dC2 = (sR1.transpose() * sC1.transpose()), dC3 = dR1.template cast().transpose() * dC1.transpose() ); + VERIFY_IS_APPROX( dC2 = (sC1.transpose() * sR1.transpose()), dC3 = dC1.transpose() * dR1.template cast().transpose() ); + + + VERIFY_IS_APPROX( dC2 = dR1 * sC1, dC3 = dR1.template cast() * sC1 ); + VERIFY_IS_APPROX( dC2 = sR1 * dC1, dC3 = sR1.template cast() * dC1 ); + VERIFY_IS_APPROX( dC2 = dC1 * sR1, dC3 = dC1 * sR1.template cast() ); + VERIFY_IS_APPROX( dC2 = sC1 * dR1, dC3 = sC1 * dR1.template cast() ); + + VERIFY_IS_APPROX( dC2 = dR1.row(0) * sC1, dC3 = dR1.template cast().row(0) * sC1 ); + VERIFY_IS_APPROX( dC2 = sR1 * dC1.col(0), dC3 = sR1.template cast() * dC1.col(0) ); + VERIFY_IS_APPROX( dC2 = dC1.row(0) * sR1, dC3 = dC1.row(0) * sR1.template cast() ); + VERIFY_IS_APPROX( dC2 = sC1 * dR1.col(0), dC3 = sC1 * dR1.template cast().col(0) ); +} + void test_sparse_product() { for(int i = 0; i < g_repeat; i++) { @@ -381,5 +463,7 @@ void test_sparse_product() CALL_SUBTEST_2( (sparse_product, RowMajor > >()) ); CALL_SUBTEST_3( (sparse_product >()) ); CALL_SUBTEST_4( (sparse_product_regression_test, Matrix >()) ); + + CALL_SUBTEST_5( (test_mixing_types()) ); } } From b35d1ce4a532a40e16927bb303a4c48b2bef1d7c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 6 Sep 2017 10:02:49 +0200 Subject: [PATCH 185/680] Implement true compile-time "if" for apply_rotation_in_the_plane. This fixes a compilation issue for vectorized real type with missing vectorization for complexes, e.g. AVX512. --- Eigen/src/Jacobi/Jacobi.h | 256 +++++++++++++++++++++----------------- 1 file changed, 139 insertions(+), 117 deletions(-) diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index 75595517a..af1228cb8 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -309,16 +309,144 @@ inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiR } namespace internal { + +template +struct apply_rotation_in_the_plane_selector +{ + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + for(Index i=0; i +struct apply_rotation_in_the_plane_selector +{ + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + enum { + PacketSize = packet_traits::size, + OtherPacketSize = packet_traits::size + }; + typedef typename packet_traits::type Packet; + typedef typename packet_traits::type OtherPacket; + + /*** dynamic-size vectorized paths ***/ + if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1)) + { + // both vectors are sequentially stored in memory => vectorization + enum { Peeling = 2 }; + + Index alignedStart = internal::first_default_aligned(y, size); + Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; + + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; + + for(Index i=0; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } + } + else + { + Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); + for(Index i=alignedStart; i(px); + Packet xi1 = 
ploadu(px+PacketSize); + Packet yi = pload (py); + Packet yi1 = pload (py+PacketSize); + pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); + pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); + px += Peeling*PacketSize; + py += Peeling*PacketSize; + } + if(alignedEnd!=peelingEnd) + { + Packet xi = ploadu(x+peelingEnd); + Packet yi = pload (y+peelingEnd); + pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + } + } + + for(Index i=alignedEnd; i0) // FIXME should be compared to the required alignment + { + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; + Scalar* EIGEN_RESTRICT px = x; + Scalar* EIGEN_RESTRICT py = y; + for(Index i=0; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } + } + + /*** non-vectorized path ***/ + else + { + apply_rotation_in_the_plane_selector::run(x,incrx,y,incry,size,c,s); + } + } +}; + template void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; - enum { - PacketSize = packet_traits::size, - OtherPacketSize = packet_traits::size - }; - typedef typename packet_traits::type Packet; - typedef typename packet_traits::type OtherPacket; + const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit) + && (int(packet_traits::size) == int(packet_traits::size)); + eigen_assert(xpr_x.size() == xpr_y.size()); Index size = xpr_x.size(); Index incrx = xpr_x.derived().innerStride(); @@ -332,117 +460,11 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x if (c==OtherScalar(1) && s==OtherScalar(0)) return; - /*** dynamic-size vectorized paths ***/ - - if(VectorX::SizeAtCompileTime == Dynamic && - (VectorX::Flags & VectorY::Flags & PacketAccessBit) && - (PacketSize == OtherPacketSize) && - ((incrx==1 && incry==1) || PacketSize == 1)) - { - // both vectors are sequentially stored in memory => vectorization - enum { Peeling = 2 }; - - Index alignedStart = internal::first_default_aligned(y, size); - Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - - const OtherPacket pc = pset1(c); - const OtherPacket ps = pset1(s); - conj_helper::IsComplex,false> pcj; - conj_helper pm; - - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - px += PacketSize; - py += PacketSize; - } - } - else - { - Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); - for(Index i=alignedStart; i(px); - Packet xi1 = ploadu(px+PacketSize); - Packet yi = pload (py); - Packet yi1 = pload (py+PacketSize); - pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); - pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); - px += Peeling*PacketSize; - py += Peeling*PacketSize; - } - if(alignedEnd!=peelingEnd) - { - Packet xi = ploadu(x+peelingEnd); - Packet yi = pload (y+peelingEnd); - pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstore (y+peelingEnd, 
psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - } - } - - for(Index i=alignedEnd; i::Alignment, evaluator::Alignment)>0)) // FIXME should be compared to the required alignment - { - const OtherPacket pc = pset1(c); - const OtherPacket ps = pset1(s); - conj_helper::IsComplex,false> pcj; - conj_helper pm; - Scalar* EIGEN_RESTRICT px = x; - Scalar* EIGEN_RESTRICT py = y; - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); - px += PacketSize; - py += PacketSize; - } - } - - /*** non-vectorized path ***/ - else - { - for(Index i=0; i::Alignment, evaluator::Alignment), + Vectorizable>::run(x,incrx,y,incry,size,c,s); } } // end namespace internal From 9c353dd1450c362175213c1018b15b24ac840826 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 6 Sep 2017 10:22:47 +0200 Subject: [PATCH 186/680] Add C++11 max_digits10 for half. --- Eigen/src/Core/arch/CUDA/Half.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 1c557767a..ee24e615a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -520,8 +520,8 @@ struct numeric_limits { static const bool is_bounded = false; static const bool is_modulo = false; static const int digits = 11; - static const int digits10 = 2; - //static const int max_digits10 = ; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html static const int radix = 2; static const int min_exponent = -13; static const int min_exponent10 = -4; From ea4e65bf413fd5e60cd13407879d7ccaae62f6fc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 7 Sep 2017 09:13:52 +0000 Subject: [PATCH 187/680] Fixed compilation with cuda_clang. --- Eigen/src/Core/GeneralProduct.h | 1 + Eigen/src/Core/NoAlias.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 483277fe6..694f7cbde 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -396,6 +396,7 @@ template<> struct gemv_dense_selector */ template template +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const MatrixBase &other) const { diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 41fae5096..e94c8ee96 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -99,7 +99,7 @@ class NoAlias * \sa class NoAlias */ template -NoAlias MatrixBase::noalias() +NoAlias EIGEN_DEVICE_FUNC MatrixBase::noalias() { return NoAlias(derived()); } From 6d42309f1394396c984212abad73469e6a50c3a3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 7 Sep 2017 14:34:30 +0200 Subject: [PATCH 188/680] Fix compilation of Vector::operator()(enum) by treating enums as Index --- Eigen/src/Core/util/Macros.h | 10 ++++++++++ Eigen/src/Core/util/XprHelper.h | 10 ++++++++++ Eigen/src/plugins/IndexedViewMethods.h | 10 ++++------ test/indexed_view.cpp | 5 +++++ 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index b63ea2697..caedb0b3a 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -410,6 +410,16 @@ #endif #endif +// Does the compiler support type_trais? 
+#ifndef EIGEN_HAS_TYPE_TRAITS +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || __has_feature(is_enum) || EIGEN_COMP_MSVC >= 1700) +#define EIGEN_HAS_TYPE_TRAITS 1 +#define EIGEN_INCLUDE_TYPE_TRAITS +#else +#define EIGEN_HAS_TYPE_TRAITS 0 +#endif +#endif + // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 4b337f29f..c88f1dbb9 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -34,6 +34,16 @@ inline IndexDest convert_index(const IndexSrc& idx) { return IndexDest(idx); } +// true if T can be considered as an integral index (i.e., and integral type or enum) +template struct is_valid_index_type + : std::integral_constant::value || std::is_enum::value +#else + // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. + internal::is_convertible::value +#endif +> {}; // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 22c1666c5..215bd0530 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -55,9 +55,7 @@ ivcSize(const Indices& indices) const { template struct valid_indexed_view_overload { - // Here we use is_convertible to Index instead of is_integral in order to treat enums as Index. - // In c++11 we could use is_integral && is_enum if is_convertible appears to be too permissive. - enum { value = !(internal::is_convertible::value && internal::is_convertible::value) }; + enum { value = !(internal::is_valid_index_type::value && internal::is_valid_index_type::value) }; }; public: @@ -146,7 +144,7 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col template typename internal::enable_if< - IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_integral::value)), + IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), IndexedView::type> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { @@ -157,7 +155,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST template typename internal::enable_if< - (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_integral::value)), + (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), IndexedView::type,IvcIndex> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { @@ -168,7 +166,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST template typename internal::enable_if< - (internal::get_compile_time_incr::type>::value==1) && (!internal::is_integral::value) && (!Symbolic::is_symbolic::value), + (internal::get_compile_time_incr::type>::value==1) && (!internal::is_valid_index_type::value) && (!Symbolic::is_symbolic::value), VectorBlock::value> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 7245cf378..8b3082cea 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -366,6 +366,11 @@ void check_indexed_view() VERIFY( is_same_eq( cA.middleRows<3>(1), cA.middleRows(1,fix<3>)) ); } + // Check 
compilation of enums as index type: + enum { X=0, Y=1 }; + a(X) = 1; + A(X,Y) = 1; + } void test_indexed_view() From 94e2213b3823ec1141077a7a7cc4f0b0af11876b Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 8 Sep 2017 15:49:55 -0700 Subject: [PATCH 189/680] Avoid undefined behavior in Eigen::TensorCostModel::numThreads. If the cost is large enough then the thread count can be larger than the maximum representable int, so just casting it to an int is undefined behavior. Contributed by phurst@google.com. --- unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 83c449cf1..ac6717977 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -174,8 +174,9 @@ class TensorCostModel { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { double cost = totalCost(output_size, cost_per_coeff); - int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - return numext::mini(max_threads, numext::maxi(1, threads)); + // Make sure we don't invoke undefined behavior when we convert to an int. + threads = numext::mini(threads, GenericNumTraits::highest()); + return numext::mini(max_threads, numext::maxi(1, threads)); } // taskSize assesses parallel task size. From 1b7294f6fc5661ef196acf95f0eafa3242939be1 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 8 Sep 2017 16:35:58 -0700 Subject: [PATCH 190/680] Fix cut-and-paste error. --- unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index ac6717977..b148dae39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -174,6 +174,7 @@ class TensorCostModel { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { double cost = totalCost(output_size, cost_per_coeff); + double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; // Make sure we don't invoke undefined behavior when we convert to an int. threads = numext::mini(threads, GenericNumTraits::highest()); return numext::mini(max_threads, numext::maxi(1, threads)); From 0c9ad2f52526bd7f07ba41161e68bee7231c05c7 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 14 Sep 2017 19:23:38 +0200 Subject: [PATCH 191/680] std::integral_constant is not C++03 compatible --- Eigen/src/Core/util/XprHelper.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index c88f1dbb9..10328be0d 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -36,14 +36,16 @@ inline IndexDest convert_index(const IndexSrc& idx) { // true if T can be considered as an integral index (i.e., and integral type or enum) template struct is_valid_index_type - : std::integral_constant::value || std::is_enum::value #else // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. 
internal::is_convertible::value #endif -> {}; + }; +}; // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar From 23f8b00bc884fe94e94ea273538e5546a4160e4f Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 14 Sep 2017 19:26:03 +0200 Subject: [PATCH 192/680] clang provides __has_feature(is_enum) (but not ) in C++03 mode --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index caedb0b3a..f5b071a9c 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -412,7 +412,7 @@ // Does the compiler support type_trais? #ifndef EIGEN_HAS_TYPE_TRAITS -#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || __has_feature(is_enum) || EIGEN_COMP_MSVC >= 1700) +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) #define EIGEN_HAS_TYPE_TRAITS 1 #define EIGEN_INCLUDE_TYPE_TRAITS #else From 2062ac995864d51caad71a83bfc267a6aa17f8c3 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 18 Sep 2017 18:17:39 +0100 Subject: [PATCH 193/680] Changes required for new ComputeCpp CE version. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 2b5749f55..6158acbd9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -46,8 +46,8 @@ struct SyclAllocator { namespace Eigen { - #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) - #define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) +#define ConvertToActualTypeSycl(Scalar, buf_acc) static_cast(static_cast(((buf_acc.get_pointer().get())))) +#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar) template class MemCopyFunctor { From 7c9b07dc5ccb2456ecbcdeef6870e7f4a59aba55 Mon Sep 17 00:00:00 2001 From: LaFeuille Date: Wed, 20 Sep 2017 06:38:39 +0000 Subject: [PATCH 194/680] Typo fix alignmeent ->alignment --- Eigen/src/Core/util/Macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index f5b071a9c..3a56b0e36 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -696,10 +696,10 @@ namespace Eigen { #if defined(EIGEN_DONT_VECTORIZE) #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 #elif defined(EIGEN_VECTORIZE_AVX512) - // 64 bytes static alignmeent is preferred only if really required + // 64 bytes static alignment is preferred only if really required #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 #elif defined(__AVX__) - // 32 bytes static alignmeent is preferred only if really required + // 32 bytes static alignment is preferred only if really required #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 #else #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 From 7ad07fc6f2e1dd74554ba576367883c4236c6b98 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Sep 2017 10:22:00 +0200 Subject: [PATCH 195/680] Update documentation for aligned_allocator --- Eigen/src/Core/util/Memory.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 7d9053496..f48d115f4 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -696,7 +696,15 
@@ template void swap(scoped_array &a,scoped_array &b)
 /** \class aligned_allocator
 * \ingroup Core_Module
 *
-* \brief STL compatible allocator to use with with 16 byte aligned types
+* \brief STL compatible allocator to use with types requiring a non-standard alignment.
+*
+* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
+* By default, it will thus provide at least 16 bytes alignment and more in the following cases:
+* - 32 bytes alignment if AVX is enabled.
+* - 64 bytes alignment if AVX512 is enabled.
+*
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* \link TopicPreprocessorDirectivesPerformance there \endlink.
 *
 * Example:
 * \code

From f92567fecc02f7653c5974ed9b162e49a813dc0c Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 20 Sep 2017 10:22:23 +0200
Subject: [PATCH 196/680] Add link to a useful example.

---
 Eigen/src/plugins/IndexedViewMethods.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h
index 215bd0530..a7ec63adf 100644
--- a/Eigen/src/plugins/IndexedViewMethods.h
+++ b/Eigen/src/plugins/IndexedViewMethods.h
@@ -248,6 +248,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
 *
 * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter.
 *
+ * See also this question and its answer for an example of how to duplicate coefficients.
+ *
 * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index)
 */
 template

From 8579195169ba046b980b01769edb581b281e0b8a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 22 Sep 2017 09:23:24 +0200
Subject: [PATCH 197/680] bug #1468 (1/2) : add missing std:: to memcpy

---
 Eigen/QtAlignedMalloc | 2 +-
 Eigen/src/Core/util/Memory.h | 2 +-
 Eigen/src/SparseCore/AmbiVector.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc
index c6571f129..4f07df02a 100644
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -27,7 +27,7 @@ void qFree(void *ptr)
 void *qRealloc(void *ptr, std::size_t size)
 {
   void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+  std::memcpy(newPtr, ptr, size);
   Eigen::internal::aligned_free(ptr);
   return newPtr;
 }
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index f48d115f4..c455f92a1 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -493,7 +493,7 @@ template struct smart_copy_helper {
     IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
-    memcpy(target, start, size);
+    std::memcpy(target, start, size);
   }
 };
diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h
index 8a5cc91f2..e0295f2af 100644
--- a/Eigen/src/SparseCore/AmbiVector.h
+++ b/Eigen/src/SparseCore/AmbiVector.h
@@ -94,7 +94,7 @@ class AmbiVector
         Index allocSize = m_allocatedElements * sizeof(ListEl);
         allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
         Scalar* newBuffer = new Scalar[allocSize];
-        memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
+        std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
         delete[] m_buffer;
         m_buffer = newBuffer;
       }
From 0e85a677e36956f13a3d85047d088d773b39b69b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 26 Sep 2017 10:53:33 +0200
Subject:
[PATCH 198/680] bug #1472: fix warning --- Eigen/src/Core/Map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 7ca6a9280..c437f1a92 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -114,7 +114,7 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : internal::traits::OuterStrideAtCompileTime != Dynamic ? internal::traits::OuterStrideAtCompileTime + : internal::traits::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) : IsVectorAtCompileTime ? (this->size() * innerStride()) : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) : (this->rows() * innerStride()); From bc30305d2989443901c8ebe27045c9e996d32a40 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 9 Oct 2017 16:55:10 -0400 Subject: [PATCH 199/680] complete z14 port --- Eigen/src/Core/arch/ZVector/Complex.h | 423 ++++++++++++++------ Eigen/src/Core/arch/ZVector/MathFunctions.h | 15 +- Eigen/src/Core/arch/ZVector/PacketMath.h | 42 +- 3 files changed, 329 insertions(+), 151 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 1bfb73397..3c72968e6 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -15,6 +15,10 @@ namespace Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +#endif + static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -29,10 +33,14 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) union { Packet4f v; Packet1cd cd[2]; }; +#else + Packet4f v; +#endif }; template<> struct packet_traits > : default_packet_traits @@ -89,63 +97,27 @@ template<> struct unpacket_traits { typedef std::complex type /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); -template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +/* complex first */ template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE 
void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) -{ - Packet2cf res; - res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); - res.cd[1] = res.cd[0]; - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); -} template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) { return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - pstore >((std::complex *) af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; -} template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride EIGEN_UNUSED) { pstore >(to, from); } - -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) -{ - Packet2cf res; - res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; - res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; - return res; -} - template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d a_re, a_im, v1, v2; @@ -163,27 +135,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(v1 + v2); } -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet2cf res; - res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; - res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; - return res; -} - -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } 
-template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } - +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) @@ -193,61 +150,20 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac return res; } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) -{ - std::complex EIGEN_ALIGN16 res[2]; - pstore >(res, a); - - return res[0]; -} template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - Packet2cf res; - res.cd[0] = a.cd[1]; - res.cd[1] = a.cd[0]; - return res; -} - template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) -{ - std::complex res; - Packet1cd b = padd(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; -} - template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - ptranspose(transpose); - - return padd(transpose.packet[0], transpose.packet[1]); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) -{ - std::complex res; - Packet1cd b = pmul(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; -} - template struct palign_impl { @@ -258,18 +174,6 @@ struct palign_impl } }; -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset == 1) { - first.cd[0] = first.cd[1]; - first.cd[1] = second.cd[0]; - } - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -303,6 +207,156 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = vec_madd(b.v, 
b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +/* complex follows */ +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); + + return res[0]; +} + + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; + return res; +} +#else +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + if((std::ptrdiff_t(&from) % 16) == 0) + res.v = pload((const float *)&from); + else + res.v = ploadu((const float *)&from); + res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); + return res; +} +#endif + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } + +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } + + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) +{ + Packet2cf res; + 
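// z13 (__ARCH__ < 12) has no native float vector type, so Packet4f is
// emulated as two Packet2d halves held in v4f[0] and v4f[1]; each complex
// float operation in this branch simply forwards to the Packet1cd (double)
// kernel once per half.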
res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + ptranspose(transpose); + + return padd(transpose.packet[0], transpose.packet[1]); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset == 1) { + first.cd[0] = first.cd[1]; + first.cd[1] = second.cd[0]; + } + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -337,15 +391,6 @@ template<> struct conj_helper }; EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) - -template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) -{ - // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); -} template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -356,11 +401,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return res; } -EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} - EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) { Packet2cf res; @@ -369,13 +409,6 @@ EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) return res; } -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); - kernel.packet[0].v = tmp; -} - EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet1cd tmp = kernel.packet[0].cd[1]; @@ -389,6 +422,136 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); return result; } +#else +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet4f v1, v2; + + // Permute and multiply the real parts of a and b + v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + // Get the imaginary parts of a + v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + // 
multiply a_re * b + v1 = vec_madd(v1, b.v, p4f_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(v2, b.v, p4f_ZERO); + v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); + // permute back to a proper order + v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); + + return Packet2cf(padd(v1, v2)); +} + +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet4f rev_a; + rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); + return Packet2cf(rev_a); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + Packet4f b; + b = vec_sld(a.v, a.v, 8); + b = padd(a.v, b); + return pfirst(Packet2cf(b)); +} + +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + Packet4f b1, b2; + b1 = vec_sld(vecs[0].v, vecs[1].v, 8); + b2 = vec_sld(vecs[1].v, vecs[0].v, 8); + b2 = vec_sld(b2, b2, 8); + b2 = padd(b1, b2); + + return Packet2cf(b2); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + Packet4f b; + Packet2cf prod; + b = vec_sld(a.v, a.v, 8); + prod = pmul(a, Packet2cf(b)); + + return pfirst(prod); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = vec_sld(first.v, second.v, 8); + } + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for AltiVec + Packet2cf res = conj_helper().pmul(a, b); + Packet4f s = pmul(b.v, b.v); + return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); + return result; +} +#endif } // end namespace internal diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h index 5c7aa7256..234d82be7 100644 --- a/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ 
b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -96,37 +96,48 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = pexp(x); +#else res.v4f[0] = pexp(x.v4f[0]); res.v4f[1] = pexp(x.v4f[1]); +#endif return res; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { - return __builtin_s390_vfsqdb(x); + return vec_sqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = vec_sqrt(x); +#else res.v4f[0] = psqrt(x.v4f[0]); res.v4f[1] = psqrt(x.v4f[1]); +#endif return res; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return pset1(1.0) / psqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = pset1(1.0) / psqrt(x); +#else res.v4f[0] = prsqrt(x.v4f[0]); res.v4f[1] = prsqrt(x.v4f[1]); +#endif return res; } diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 2e5f5774d..a3294d318 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -17,7 +17,7 @@ namespace Eigen { namespace internal { #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD @@ -29,7 +29,7 @@ namespace internal { #endif #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif typedef __vector int Packet4i; @@ -42,12 +42,12 @@ typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; // Z14 has builtin support for float vectors -#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH < 14) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +typedef __vector float Packet4f; +#else typedef struct { Packet2d v4f[2]; } Packet4f; -#else -typedef __vector float Packet4f; #endif typedef union { @@ -62,7 +62,7 @@ typedef union { Packet2l v2l; Packet2ul v2ul; Packet2d v2d; -#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH >= 14) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) Packet4f v4f; #endif } Packet; @@ -89,7 +89,7 @@ typedef union { Packet2l p2l_##NAME = pset1(X) // These constants are endian-agnostic -//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); @@ -99,6 +99,15 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + +static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} +static 
_EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000}; +#endif + static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -129,9 +138,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0 static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; -//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC @@ -616,7 +625,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons /* z13 has no vector float support so we emulate that with double z14 has proper vector float support. */ -#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH < 14) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) /* Helper function to simulate a vec_splat_packet4f */ template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) @@ -1041,7 +1050,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4f pconj (const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4f pmadd (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4f pmin (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmax (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pand (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } @@ -1052,9 +1060,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { r template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } - -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) @@ -1068,7 +1073,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); } -template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) 
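// The replacement specialization below corrects the types: a Packet4f
// reduction must take Packet4f and return float (the old line was apparently
// a copy-paste slip from the Packet4i section). The reduction itself pairs
// lanes with vec_sld byte rotations and adds; a rough scalar equivalent
// (hypothetical helper, illustration only):
//
//   static float predux4_scalar(const float v[4]) {
//     float s02 = v[0] + v[2];  // lanes paired by the 8-byte rotate + add
//     float s13 = v[1] + v[3];
//     return s02 + s13;         // the final 4-byte rotate + add collapses both
//   }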
+template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f b, sum; b = vec_sld(a, a, 8); @@ -1152,10 +1157,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons #endif template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } } // end namespace internal From 374f750ad4708408a1255a98964719fd598b0659 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 9 Oct 2017 16:56:30 -0400 Subject: [PATCH 200/680] eliminate 'enumeral and non-enumeral type in conditional expression' warning --- Eigen/src/Core/Map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 7ca6a9280..169123824 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -114,7 +114,7 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : internal::traits::OuterStrideAtCompileTime != Dynamic ? internal::traits::OuterStrideAtCompileTime + : internal::traits::OuterStrideAtCompileTime != Dynamic ? int(internal::traits::OuterStrideAtCompileTime) : IsVectorAtCompileTime ? (this->size() * innerStride()) : int(Flags)&RowMajorBit ? 
(this->cols() * innerStride()) : (this->rows() * innerStride()); From c2a224648919acb27bc208dcc24797345e3a1353 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 10 Oct 2017 13:38:32 -0400 Subject: [PATCH 201/680] fix predux_mul for z14/float --- Eigen/src/Core/arch/ZVector/PacketMath.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index a3294d318..21612ba91 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -1115,7 +1115,9 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { - return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); + Packet4f prod; + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); } // min From 3dcae2a27ffca563ef0a5b78d9d9d27cb05216a8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 09:40:45 -0400 Subject: [PATCH 202/680] initial pexp() for 32-bit floats, commented out due to vec_cts() --- Eigen/src/Core/arch/ZVector/PacketMath.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 21612ba91..a72aea851 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -103,6 +103,12 @@ static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) + static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000}; @@ -187,7 +193,11 @@ template<> struct packet_traits : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + HasExp = 0, +#else HasExp = 1, +#endif HasSqrt = 1, HasRsqrt = 1, HasRound = 1, From df173f562062843a73454f2eb2479ae1e26dbcdf Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 09:40:49 -0400 Subject: [PATCH 203/680] initial pexp() for 32-bit floats, commented out due to vec_cts() --- Eigen/src/Core/arch/ZVector/MathFunctions.h | 96 +++++++++++++++++++-- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h index 234d82be7..ff33a975f 100644 --- a/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -20,6 +20,50 @@ namespace Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); + +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + +/* the smallest non denormalized float number */ +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 
0xff800000); // -1.f/0.f +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); +#endif + static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); @@ -93,16 +137,56 @@ Packet2d pexp(const Packet2d& _x) } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& x) +Packet4f pexp(const Packet4f& _x) { - Packet4f res; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) - res = pexp(x); +/* + Packet4f x = _x; + + Packet4f tmp, fx; + Packet4i emm0; + + // clamp x + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + + // express exp(x) as exp(g + n*log(2)) + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + z = pmul(x,x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // build 2^n + emm0 = vec_cts(fx, 0); + emm0 = emm0 + p4i_0x7f; + emm0 = emm0 << reinterpret_cast(p4i_23); + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. 
+ Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), + isnumber_mask);*/ + return _x; #else - res.v4f[0] = pexp(x.v4f[0]); - res.v4f[1] = pexp(x.v4f[1]); -#endif + Packet4f res; + res.v4f[0] = pexp(_x.v4f[0]); + res.v4f[1] = pexp(_x.v4f[1]); return res; +#endif } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED From d0b7b9d0d321905776326ce99c5c3ff3d48f4ce7 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 10:17:22 -0400 Subject: [PATCH 204/680] some Packet2cf pmul fixes --- Eigen/src/Core/arch/ZVector/Complex.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 3c72968e6..f9e3a480a 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -426,21 +426,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { - Packet4f v1, v2; + Packet4f a_re, a_im, prod, prod_im; // Permute and multiply the real parts of a and b - v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); // Get the imaginary parts of a - v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); - // multiply a_re * b - v1 = vec_madd(v1, b.v, p4f_ZERO); + a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); // multiply a_im * b and get the conjugate result - v2 = vec_madd(v2, b.v, p4f_ZERO); - v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); + prod_im = a_im * b.v; + prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); // permute back to a proper order - v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); - - return Packet2cf(padd(v1, v2)); + prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); + + // multiply a_re * b, add prod_im + prod = pmadd(a_re, b.v, prod_im); + + return Packet2cf(prod); } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) From 380d41fd76f2e8aeb4f8d4b9138515a05c9d70e3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 10:40:12 -0400 Subject: [PATCH 205/680] added some extra debugging --- Eigen/src/Core/arch/ZVector/Complex.h | 10 ++++++++++ Eigen/src/Core/arch/ZVector/PacketMath.h | 11 +++++++++++ 2 files changed, 21 insertions(+) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index f9e3a480a..cd4316687 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -428,18 +428,28 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con { Packet4f a_re, a_im, prod, prod_im; + std::cout << "a = " << a.v << std::endl; + std::cout << ", b = " << b.v << std::endl; // Permute and multiply the real parts of a and b a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + + std::cout << "a_re = " << a_re << std::endl; // Get the imaginary parts of a a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + std::cout << "a_im = " << a_im << std::endl; + // multiply a_im * b and get the conjugate result prod_im = a_im * b.v; + std::cout << "prod_im = " << prod_im << std::endl; prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); + std::cout << "prod_im = " << prod_im << std::endl; // permute back to a proper order 
prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); + std::cout << "prod_im = " << prod_im << std::endl; // multiply a_re * b, add prod_im prod = pmadd(a_re, b.v, prod_im); + std::cout << "prod = " << prod << std::endl; return Packet2cf(prod); } diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index a72aea851..0b37f4992 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -286,6 +286,17 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) +{ + Packet vt; + vt.v4f = v; + s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3]; + return s; +} +#endif + + template struct palign_impl { From c4ad358565a1099bc0bdad797c3ca8bd297ea770 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 11:05:29 -0400 Subject: [PATCH 206/680] explicitly set conjugate mask --- Eigen/src/Core/arch/ZVector/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index cd4316687..7504acff9 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -16,7 +16,7 @@ namespace Eigen { namespace internal { #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); #endif static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; From 98e52cc770ac26fbd29aaa7583443009d7937084 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 12 Oct 2017 15:22:10 -0400 Subject: [PATCH 207/680] rollback 374f750ad4708408a1255a98964719fd598b0659 --- Eigen/src/Core/Map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 169123824..7ca6a9280 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -114,7 +114,7 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : internal::traits::OuterStrideAtCompileTime != Dynamic ? int(internal::traits::OuterStrideAtCompileTime) + : internal::traits::OuterStrideAtCompileTime != Dynamic ? internal::traits::OuterStrideAtCompileTime : IsVectorAtCompileTime ? (this->size() * innerStride()) : int(Flags)&RowMajorBit ? 
(this->cols() * innerStride()) : (this->rows() * innerStride()); From 6c3475f110a017b3872a94a4ef13d0c8b543faca Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 12 Oct 2017 15:34:55 -0400 Subject: [PATCH 208/680] remove debugging --- Eigen/src/Core/arch/ZVector/Complex.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 7504acff9..95aba428f 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -428,28 +428,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con { Packet4f a_re, a_im, prod, prod_im; - std::cout << "a = " << a.v << std::endl; - std::cout << ", b = " << b.v << std::endl; // Permute and multiply the real parts of a and b a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); - std::cout << "a_re = " << a_re << std::endl; // Get the imaginary parts of a a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); - std::cout << "a_im = " << a_im << std::endl; // multiply a_im * b and get the conjugate result prod_im = a_im * b.v; - std::cout << "prod_im = " << prod_im << std::endl; prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); - std::cout << "prod_im = " << prod_im << std::endl; // permute back to a proper order prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); - std::cout << "prod_im = " << prod_im << std::endl; // multiply a_re * b, add prod_im prod = pmadd(a_re, b.v, prod_im); - std::cout << "prod = " << prod << std::endl; return Packet2cf(prod); } From 0e6e027e91c134104d3595e5bfcd4a7d2a44a6dd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 12 Oct 2017 15:38:34 -0400 Subject: [PATCH 209/680] check both z13 and z14 arches --- CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e1648fd1..62bf823c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -271,12 +271,18 @@ if(NOT MSVC) message(STATUS "Enabling NEON in tests/examples") endif() - option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) - if(EIGEN_TEST_ZVECTOR) + option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_Z13) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector") message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples") endif() + option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_Z14) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector") + message(STATUS "Enabling S390X(zEC14) ZVECTOR in tests/examples") + endif() + check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP) if(COMPILER_SUPPORT_OPENMP) option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF) From f349507e02b7623d439d90006289d0d74a75898d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 13 Oct 2017 15:58:12 -0700 Subject: [PATCH 210/680] Specialize ThreadPoolDevice::enqueueNotification for the case with no args. As an example this reduces binary size of a TensorFlow demo app for Android by about 2.5%. 
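The saving comes from not materializing a std::bind wrapper in the common zero-argument case. A condensed sketch of the dispatch, with a hypothetical Pool type and C++17 if constexpr standing in for the plain if used in the actual patch below (illustration only, not Eigen's API):

  #include <functional>
  #include <utility>

  struct Pool {                                       // stand-in for the thread pool
    void Schedule(std::function<void()> fn) { fn(); } // runs inline for the demo
  };

  template <class Function, class... Args>
  void enqueue_no_notification(Pool& pool, Function&& f, Args&&... args) {
    if constexpr (sizeof...(args) > 0) {
      // arguments present: capture them through std::bind, as before
      pool.Schedule(std::bind(std::forward<Function>(f), std::forward<Args>(args)...));
    } else {
      // zero-argument case: hand the callable straight to the pool; no bind
      // object is instantiated, which is where the code-size win comes from
      pool.Schedule(std::forward<Function>(f));
    }
  }

With if constexpr only the taken branch is instantiated per call site; the original code, which predates C++17, relies on the optimizer to drop the dead branch instead.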
--- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 16180ca69..ec6802e85 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -154,7 +154,11 @@ struct ThreadPoolDevice { template EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { - pool_->Schedule(std::bind(f, args...)); + if (sizeof...(args) > 0) { + pool_->Schedule(std::bind(f, args...)); + } else { + pool_->Schedule(f); + } } // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if From 4245475d22f35572199a05a8119e8caf9f3e6c36 Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Fri, 20 Oct 2017 03:20:13 +0000 Subject: [PATCH 211/680] Fixing missing inlines on device functions for newer CUDA cards --- Eigen/src/Core/arch/CUDA/Half.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index ee24e615a..666b23c6e 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -154,55 +154,55 @@ namespace half_impl { // versions to get the ALU speed increased), but you do save the // conversion steps back and forth. -__device__ half operator + (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return __hadd(a, b); } -__device__ half operator * (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { return __hmul(a, b); } -__device__ half operator - (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { return __hsub(a, b); } -__device__ half operator / (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); } -__device__ half operator - (const half& a) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { return __hneg(a); } -__device__ half& operator += (half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { a = a + b; return a; } -__device__ half& operator *= (half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { a = a * b; return a; } -__device__ half& operator -= (half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { a = a - b; return a; } -__device__ half& operator /= (half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { a = a / b; return a; } -__device__ bool operator == (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { return __heq(a, b); } -__device__ bool operator != (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { return __hne(a, b); } -__device__ bool operator < (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { return __hlt(a, b); } -__device__ 
bool operator <= (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { return __hle(a, b); } -__device__ bool operator > (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { return __hgt(a, b); } -__device__ bool operator >= (const half& a, const half& b) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { return __hge(a, b); } @@ -619,7 +619,7 @@ struct hash { // Add the missing shfl_xor intrinsic #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 -__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { #if EIGEN_CUDACC_VER < 90000 return static_cast(__shfl_xor(static_cast(var), laneMask, width)); #else From 9bb26eb8f1438d7856ae6c37cd5ba2c4414605fb Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Sat, 21 Oct 2017 00:50:38 +0000 Subject: [PATCH 212/680] Restore `__device__` --- Eigen/src/Core/arch/CUDA/Half.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 666b23c6e..bfda39df5 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -154,55 +154,55 @@ namespace half_impl { // versions to get the ALU speed increased), but you do save the // conversion steps back and forth. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { return __hadd(a, b); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { return __hmul(a, b); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { return __hsub(a, b); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { +EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { return __hneg(a); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { a = a + b; return a; } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { a = a * b; return a; } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { a = a - b; return a; } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { a = a / b; return a; } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { return __heq(a, b); } -EIGEN_STRONG_INLINE 
EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { return __hne(a, b); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { return __hlt(a, b); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { return __hle(a, b); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { return __hgt(a, b); } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { return __hge(a, b); } @@ -619,7 +619,7 @@ struct hash { // Add the missing shfl_xor intrinsic #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { #if EIGEN_CUDACC_VER < 90000 return static_cast(__shfl_xor(static_cast(var), laneMask, width)); #else From a6d875bac8df04844781cc5a50c5063c8d920478 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sun, 22 Oct 2017 08:12:45 -0700 Subject: [PATCH 213/680] Removed unnecessary #include --- unsupported/test/cxx11_tensor_argmax_cuda.cu | 6 ------ unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 6 ------ unsupported/test/cxx11_tensor_complex_cuda.cu | 6 ------ unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu | 6 ------ unsupported/test/cxx11_tensor_contract_cuda.cu | 5 ----- unsupported/test/cxx11_tensor_cuda.cu | 6 ------ unsupported/test/cxx11_tensor_device.cu | 5 ----- unsupported/test/cxx11_tensor_of_float16_cuda.cu | 5 ----- unsupported/test/cxx11_tensor_random_cuda.cu | 6 ------ unsupported/test/cxx11_tensor_reduction_cuda.cu | 6 ------ unsupported/test/cxx11_tensor_scan_cuda.cu | 5 ----- 11 files changed, 62 deletions(-) diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 0e8b8125d..3d73d491a 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -15,12 +15,6 @@ #include "main.h" 
#include -// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif - using Eigen::Tensor; void test_cuda_nullary() { diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu index 4f0f621b4..aac780905 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu @@ -14,12 +14,6 @@ #include "main.h" #include -// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif - using Eigen::Tensor; template diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index c68287e34..3621e2aa6 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -17,11 +17,6 @@ #include "main.h" #include -// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif using Eigen::Tensor; typedef Tensor::DimensionPair DimPair; diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index d9059a2dc..9584a539f 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -15,12 +15,6 @@ #include "main.h" #include -// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif - using Eigen::Tensor; void test_cuda_nullary() { diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index d5bfeeb39..7c14bc187 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -16,11 +16,6 @@ #include "main.h" #include -// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif using Eigen::Tensor; using Eigen::RowMajor; diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index c9f3ae1ae..167b75d25 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -16,11 +16,6 @@ #include "main.h" #include -// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif using Eigen::Tensor; diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu index 9d08605fc..fa1a46732 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_cuda.cu @@ -16,12 +16,6 @@ #include "main.h" #include -// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif - void test_cuda_random_uniform() { diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index d6ce04f1c..ec0669704 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -15,12 +15,6 @@ #include "main.h" #include 
-// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif - template static void test_full_reductions() { diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu index e99724b91..1d4edef11 100644 --- a/unsupported/test/cxx11_tensor_scan_cuda.cu +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -16,11 +16,6 @@ #include "main.h" #include -// The EIGEN_CUDACC_VER macro is provided by -// unsupported/Eigen/CXX11/Tensor included above -#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 -#include -#endif using Eigen::Tensor; typedef Tensor::DimensionPair DimPair; From 11ddac57e5fb7ed8b377b58ca955689bb637afe2 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 23 Oct 2017 13:22:22 +0000 Subject: [PATCH 214/680] Merged in guillaume_michel/eigen (pull request PR-334) - Add support for NEON plog PacketMath function --- Eigen/src/Core/arch/NEON/MathFunctions.h | 92 ++++++++++++++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index 6bb05bb92..c48c61023 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -84,6 +84,98 @@ Packet4f pexp(const Packet4f& _x) return y; } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + + _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000); + + /* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 + */ + _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + + x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ + Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + + Packet4i ux = vreinterpretq_s32_f32(x); + + Packet4i emm0 = vshrq_n_s32(ux, 23); + + /* keep only the fractional part */ + ux = vandq_s32(ux, p4i_inv_mant_mask); + ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half)); + x = vreinterpretq_f32_s32(ux); + + emm0 = vsubq_s32(emm0, p4i_0x7f); + Packet4f e = vcvtq_f32_s32(emm0); + + e = vaddq_f32(e, p4f_1); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF); + Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); + x = vsubq_f32(x, p4f_1); + e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask))); + x = vaddq_f32(x, tmp); + + 
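/* The lines below evaluate the cephes minimax polynomial for log(1+x) on the
   reduced argument (x now lies in [sqrt(1/2)-1, sqrt(2)-1]) and then add back
   the exponent contribution e*log(2), split into the exact high part
   cephes_log_q2 plus the small correction cephes_log_q1 so that the final sum
   keeps full single precision. */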
Packet4f z = vmulq_f32(x,x); + + Packet4f y = p4f_cephes_log_p0; + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p5); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p6); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p7); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p8); + y = vmulq_f32(y, x); + + y = vmulq_f32(y, z); + + tmp = vmulq_f32(e, p4f_cephes_log_q1); + y = vaddq_f32(y, tmp); + + + tmp = vmulq_f32(z, p4f_half); + y = vsubq_f32(y, tmp); + + tmp = vmulq_f32(e, p4f_cephes_log_q2); + x = vaddq_f32(x, y); + x = vaddq_f32(x, tmp); + x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + return x; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 836fbc0dd..6283b6cfa 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -81,7 +81,7 @@ template<> struct packet_traits : default_packet_traits // FIXME check the Has* HasSin = 0, HasCos = 0, - HasLog = 0, + HasLog = 1, HasExp = 1, HasSqrt = 0 }; From e8468ea91b45e6b09e1a58626a78fd723da9b64f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 8 Nov 2017 10:24:28 +0100 Subject: [PATCH 215/680] Fix overflow issues in BDCSVD --- Eigen/src/SVD/BDCSVD.h | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index b8c41c560..effa643fd 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -11,7 +11,7 @@ // Copyright (C) 2013 Jean Ceccato // Copyright (C) 2013 Pierre Zoppitelli // Copyright (C) 2013 Jitse Niesen -// Copyright (C) 2014-2016 Gael Guennebaud +// Copyright (C) 2014-2017 Gael Guennebaud // // Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -696,7 +696,9 @@ typename BDCSVD::RealScalar BDCSVD::secularEq(RealScalar for(Index i=0; i::computeSingVals(const ArrayRef& col0, const ArrayRef& d { using std::abs; using std::swap; + using std::sqrt; Index n = col0.size(); Index actual_n = n; + // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above + // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value. while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n; for (Index k = 0; k < n; ++k) @@ -732,7 +737,9 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d right = (diag(actual_n-1) + col0.matrix().norm()); else { - // Skip deflated singular values + // Skip deflated singular values, + // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside. 
+ // This should be equivalent to using perm[] Index l = k+1; while(col0(l)==Literal(0)) { ++l; eigen_internal_assert(l::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar leftShifted, rightShifted; if (shift == left) { - leftShifted = (std::numeric_limits::min)(); + // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)), + // the factor 2 is to be more conservative + leftShifted = numext::maxi( (std::numeric_limits::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits::max)()) ); + + // check that we did it right: + eigen_internal_assert( (std::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) ); // I don't understand why the case k==0 would be special there: - // if (k == 0) rightShifted = right - left; else - rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe + // if (k == 0) rightShifted = right - left; else + rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe } else { - leftShifted = -(right - left) * RealScalar(0.6); - rightShifted = -(std::numeric_limits::min)(); + leftShifted = -(right - left) * RealScalar(0.51); + if(k+1::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits::max)()) ); + else + rightShifted = -(std::numeric_limits::min)(); } RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); @@ -980,7 +995,7 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index Index start = firstCol + shift; RealScalar c = m_computed(start, start); RealScalar s = m_computed(start+i, start); - RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s)); + RealScalar r = numext::hypot(c,s); if (r == Literal(0)) { m_computed(start+i, start+i) = Literal(0); From e9d2888e74f9afaab16e2b5ec5264d44613ff366 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 8 Nov 2017 10:26:03 +0100 Subject: [PATCH 216/680] Improve debugging tests and output in BDCSVD --- Eigen/src/SVD/BDCSVD.h | 131 ++++++++++++++++++++++++++++++++--------- 1 file changed, 102 insertions(+), 29 deletions(-) diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index effa643fd..b440901d6 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -22,6 +22,11 @@ // #define EIGEN_BDCSVD_DEBUG_VERBOSE // #define EIGEN_BDCSVD_SANITY_CHECKS +#ifdef EIGEN_BDCSVD_SANITY_CHECKS +#undef eigen_internal_assert +#define eigen_internal_assert(X) assert(X); +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -591,7 +596,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // but others are interleaved and we must ignore them at this stage. 
// To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==Literal(0)) --actual_n; + while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); } Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) @@ -618,13 +623,11 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec std::cout << " shift: " << shifts.transpose() << "\n"; { - Index actual_n = n; - while(actual_n>1 && abs(col0(actual_n-1))= 0).all()); std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n"; - std::cout << " check3 (>0) : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n"; - std::cout << " check4 (>0) : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n"; + assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all()); } #endif @@ -652,13 +655,13 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec #endif #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(U.allFinite()); - assert(V.allFinite()); - assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n); - assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n); assert(m_naiveU.allFinite()); assert(m_naiveV.allFinite()); assert(m_computed.allFinite()); + assert(U.allFinite()); + assert(V.allFinite()); +// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits::epsilon() * n); +// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits::epsilon() * n); #endif // Because of deflation, the singular values might not be completely sorted. 
@@ -673,6 +676,15 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec if(m_compV) V.col(i).swap(V.col(i+1)); } } + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + { + bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all(); + if(!singular_values_sorted) + std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n"; + assert(singular_values_sorted); + } +#endif // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end @@ -749,19 +761,22 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar mid = left + (right-left) / Literal(2); RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0)); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << right-left << "\n"; - std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right) << "\n"; - std::cout << " = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.2*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.3*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.4*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.49*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.5*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.51*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.6*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.7*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.8*(left+right), col0, diag, perm, diag, 0) - << " " << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n"; + std::cout << "right-left = " << right-left << "\n"; +// std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left) +// << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right) << "\n"; + std::cout << " = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.1) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.2) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.3) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.4) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.49) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.5) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.51) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.6) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.7) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.8) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.9) *(right-left), col0, diag, perm, diag, 0) + << " " << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n"; #endif RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? 
left : right; @@ -804,13 +819,16 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // And find mu such that f(mu)==0: RealScalar muZero = -a/b; RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift); + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((std::isfinite)(fZero)); +#endif muPrev = muCur; fPrev = fCur; muCur = muZero; fCur = fZero; - if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; @@ -843,18 +861,30 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d else rightShifted = -(std::numeric_limits::min)(); } - + RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); -#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE +#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift); #endif +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + if(!(std::isfinite)(fLeft)) + std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n"; + assert((std::isfinite)(fLeft)); + + if(!(std::isfinite)(fRight)) + std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n"; +// assert((std::isfinite)(fRight)); +#endif + #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(!(fLeft * fRight<0)) { - std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose() << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n"; - std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; + std::cout << "f(leftShifted) using leftShifted=" << leftShifted << " ; diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; " + << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n"; + std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " + << "[" << left << " .. 
" << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift << " , f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift) << " == " << secularEq(right, col0, diag, perm, diag, 0) << "\n"; } #endif eigen_internal_assert(fLeft * fRight < Literal(0)); @@ -863,6 +893,8 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d { RealScalar midShifted = (leftShifted + rightShifted) / Literal(2); fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); + eigen_internal_assert((numext::isfinite)(fMid)); + if (fLeft * fMid < Literal(0)) { rightShifted = midShifted; @@ -881,6 +913,15 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d shifts[k] = shift; mus[k] = muCur; +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE + if(k+1=singVals[k-1]); + assert(singVals[k]>=diag(k)); +#endif + // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - @@ -915,14 +956,42 @@ void BDCSVD::perturbCol0 // see equation (3.6) RealScalar dk = diag(k); RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk)); +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + if(prod<0) { + std::cout << "k = " << k << " ; z(k)=" << col0(k) << ", diag(k)=" << dk << "\n"; + std::cout << "prod = " << "(" << singVals(last) << " + " << dk << ") * (" << mus(last) << " + (" << shifts(last) << " - " << dk << "))" << "\n"; + std::cout << " = " << singVals(last) + dk << " * " << mus(last) + (shifts(last) - dk) << "\n"; + } + assert(prod>=0); +#endif for(Index l = 0; l=k && (l==0 || l-1>=m)) + { + std::cout << "Error in perturbCol0\n"; + std::cout << " " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " " << "\n"; + std::cout << " " <=0); +#endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk)) @@ -934,6 +1003,9 @@ void BDCSVD::perturbCol0 std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((std::isfinite)(tmp)); +#endif zhat(k) = col0(k) > Literal(0) ? 
tmp : -tmp; } } @@ -1043,7 +1115,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi } c/=r; s/=r; - m_computed(firstColm + i, firstColm) = r; + m_computed(firstColm + i, firstColm) = r; m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i); m_computed(firstColm + j, firstColm) = Literal(0); @@ -1117,6 +1189,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "to be sorted: " << diag.transpose() << "\n\n"; + std::cout << " : " << col0.transpose() << "\n\n"; #endif { // Check for total deflation @@ -1207,7 +1280,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index if( (diag(i) - diag(i-1)) < NumTraits::epsilon()*maxDiag ) { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*diag(i) << "\n"; + std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*/*diag(i)*/maxDiag << "\n"; #endif eigen_internal_assert(abs(diag(i) - diag(i-1)) Date: Wed, 8 Nov 2017 23:28:01 +0100 Subject: [PATCH 217/680] Fix issue with boost::multiprec in previous commit --- Eigen/src/SVD/BDCSVD.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index b440901d6..0abd4c1bb 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -845,10 +845,10 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d { // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)), // the factor 2 is to be more conservative - leftShifted = numext::maxi( (std::numeric_limits::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits::max)()) ); + leftShifted = numext::maxi( (std::numeric_limits::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits::max)()) ); // check that we did it right: - eigen_internal_assert( (std::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) ); + eigen_internal_assert( (numext::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) ); // I don't understand why the case k==0 would be special there: // if (k == 0) rightShifted = right - left; else rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe @@ -857,7 +857,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d { leftShifted = -(right - left) * RealScalar(0.51); if(k+1::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits::max)()) ); + rightShifted = -numext::maxi( (std::numeric_limits::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits::max)()) ); else rightShifted = -(std::numeric_limits::min)(); } From 912e9965ef64e97ed496e04b5c2a3376a21bcfd1 Mon Sep 17 00:00:00 2001 From: zhouzhaoping Date: Thu, 9 Nov 2017 08:49:01 +0000 Subject: [PATCH 218/680] a small mistake QuickReference.dox edited online with Bitbucket --- doc/QuickReference.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox index 59d7d05e4..18c90a2a9 100644 --- a/doc/QuickReference.dox +++ b/doc/QuickReference.dox @@ -68,7 +68,7 @@ Array <=> Array4f Conversion between the matrix and array worlds: \code -Array44f a1, a1; +Array44f a1, a2; Matrix4f m1, m2; m1 = a1 * a2; // coeffwise product, implicit conversion from array to matrix. 
a1 = m1 * m2; // matrix product, implicit conversion from matrix to array. From 42a83346683861e7a58e8c9aa407eb001d56befa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Thu, 4 Jan 2018 16:01:01 +0800 Subject: [PATCH 219/680] ENH: exp supports complex type for cuda --- Eigen/src/Core/MathFunctions.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 5ba5293a0..1b864a405 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1289,6 +1289,22 @@ float exp(const float &x) { return ::expf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + auto com = ::expf(x.real()); + auto res_real = com * ::cosf(x.imag()); + auto res_imag = com * ::sinf(x.imag()); + return std::complex(res_real, res_imag); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + auto com = ::exp(x.real()); + auto res_real = com * ::cos(x.imag()); + auto res_imag = com * ::sin(x.imag()); + return std::complex(res_real, res_imag); +} #endif template From 0c57be407d790c6013e9bad7c8f0123f4689d172 Mon Sep 17 00:00:00 2001 From: Daniel Trebbien Date: Sun, 18 Feb 2018 15:35:45 -0800 Subject: [PATCH 220/680] Move up the specialization of std::numeric_limits This fixes a compilation error seen when building TensorFlow on macOS: https://github.com/tensorflow/tensorflow/issues/17067 --- Eigen/src/Core/arch/CUDA/Half.h | 97 ++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 43 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index bfda39df5..2bb3d0d9b 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -145,6 +145,60 @@ struct half : public half_impl::half_base { } }; +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return 
Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // end namespace std + +namespace Eigen { + namespace half_impl { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 @@ -501,49 +555,6 @@ template<> struct is_arithmetic { enum { value = true }; }; } // end namespace internal -} // end namespace Eigen - -namespace std { -template<> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } - static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } - static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } - static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } - static Eigen::half round_error() { return Eigen::half(0.5); } - static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } - static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } - static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } -}; -} - -namespace Eigen { - template<> struct NumTraits : GenericNumTraits { From d820ab9edc0b38af4cdb3d545714a0c9083e5a78 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Mar 2018 09:33:43 +0100 Subject: [PATCH 221/680] Add static assertion on selfadjoint-view's UpLo parameter. 
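For illustration, a minimal snippet (not part of the patch) of the behaviour the new assertion enforces: any mode other than Lower or Upper is now rejected at compile time instead of being silently accepted.

    #include <Eigen/Dense>
    using namespace Eigen;

    Matrix3d m = Matrix3d::Random();
    m.selfadjointView<Lower>();            // OK
    m.selfadjointView<Upper>();            // OK
    // m.selfadjointView<StrictlyUpper>(); // now a compile-time error:
    //   SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY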
--- Eigen/src/Core/SelfAdjointView.h | 4 +++- Eigen/src/Core/util/StaticAssert.h | 3 ++- test/selfadjoint.cpp | 4 ++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 7e71fe3c0..be219726b 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -71,7 +71,9 @@ template class SelfAdjointView EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) - {} + { + EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY); + } EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); } diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index cb1678900..987d5fce4 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -101,7 +101,8 @@ THIS_TYPE_IS_NOT_SUPPORTED=1, STORAGE_KIND_MUST_MATCH=1, STORAGE_INDEX_MUST_MATCH=1, - CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1 + CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, + SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1 }; }; diff --git a/test/selfadjoint.cpp b/test/selfadjoint.cpp index 92401e506..5ac219352 100644 --- a/test/selfadjoint.cpp +++ b/test/selfadjoint.cpp @@ -7,6 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define EIGEN_NO_STATIC_ASSERT #include "main.h" // This file tests the basic selfadjointView API, @@ -45,6 +46,9 @@ template void selfadjoint(const MatrixType& m) m4 = m2; m4 -= m1.template selfadjointView(); VERIFY_IS_APPROX(m4, m2-m3); + + VERIFY_RAISES_ASSERT(m2.template selfadjointView()); + VERIFY_RAISES_ASSERT(m2.template selfadjointView()); } void bug_159() From f6be7289d703ea608db01bfae1fb41cb93465363 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Mar 2018 10:00:51 +0100 Subject: [PATCH 222/680] Implement better static assertion checking to make sure that the first assertion is a static one and not a runtime one. --- Eigen/src/Core/util/StaticAssert.h | 3 ++- test/main.h | 37 ++++++++++++++++++++++++++++-- test/selfadjoint.cpp | 6 ++--- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 987d5fce4..500e47792 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -24,6 +24,7 @@ * */ +#ifndef EIGEN_STATIC_ASSERT #ifndef EIGEN_NO_STATIC_ASSERT #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)) @@ -132,7 +133,7 @@ #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG); #endif // EIGEN_NO_STATIC_ASSERT - +#endif // EIGEN_STATIC_ASSERT // static assertion failing if the type \a TYPE is not a vector type #define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \ diff --git a/test/main.h b/test/main.h index 429c44f81..6079cbd06 100644 --- a/test/main.h +++ b/test/main.h @@ -175,6 +175,12 @@ namespace Eigen eigen_assert_exception(void) {} ~eigen_assert_exception() { Eigen::no_more_assert = false; } }; + + struct eigen_static_assert_exception + { + eigen_static_assert_exception(void) {} + ~eigen_static_assert_exception() { Eigen::no_more_assert = false; } + }; } // If EIGEN_DEBUG_ASSERTS is defined and if no assertion is triggered while // one should have been, then the list of excecuted assertions is printed out. 
@@ -238,6 +244,7 @@ namespace Eigen else \ EIGEN_THROW_X(Eigen::eigen_assert_exception()); \ } + #ifdef EIGEN_EXCEPTIONS #define VERIFY_RAISES_ASSERT(a) { \ Eigen::no_more_assert = false; \ @@ -249,13 +256,39 @@ namespace Eigen catch (Eigen::eigen_assert_exception&) { VERIFY(true); } \ Eigen::report_on_cerr_on_assert_failure = true; \ } - #endif //EIGEN_EXCEPTIONS + #endif // EIGEN_EXCEPTIONS #endif // EIGEN_DEBUG_ASSERTS + #if defined(TEST_CHECK_STATIC_ASSERTIONS) && defined(EIGEN_EXCEPTIONS) + #define EIGEN_STATIC_ASSERT(a,MSG) \ + if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ + { \ + Eigen::no_more_assert = true; \ + if(report_on_cerr_on_assert_failure) \ + eigen_plain_assert(a && #MSG); \ + else \ + EIGEN_THROW_X(Eigen::eigen_static_assert_exception()); \ + } + #define VERIFY_RAISES_STATIC_ASSERT(a) { \ + Eigen::no_more_assert = false; \ + Eigen::report_on_cerr_on_assert_failure = false; \ + try { \ + a; \ + VERIFY(Eigen::should_raise_an_assert && # a); \ + } \ + catch (Eigen::eigen_static_assert_exception&) { VERIFY(true); } \ + Eigen::report_on_cerr_on_assert_failure = true; \ + } + #endif // TEST_CHECK_STATIC_ASSERTIONS + #ifndef VERIFY_RAISES_ASSERT #define VERIFY_RAISES_ASSERT(a) \ std::cout << "Can't VERIFY_RAISES_ASSERT( " #a " ) with exceptions disabled\n"; #endif +#ifndef VERIFY_RAISES_STATIC_ASSERT + #define VERIFY_RAISES_STATIC_ASSERT(a) \ + std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n"; +#endif #if !defined(__CUDACC__) #define EIGEN_USE_CUSTOM_ASSERT @@ -264,10 +297,10 @@ namespace Eigen #else // EIGEN_NO_ASSERTION_CHECKING #define VERIFY_RAISES_ASSERT(a) {} + #define VERIFY_RAISES_STATIC_ASSERT(a) {} #endif // EIGEN_NO_ASSERTION_CHECKING - #define EIGEN_INTERNAL_DEBUGGING #include // required for createRandomPIMatrixOfRank diff --git a/test/selfadjoint.cpp b/test/selfadjoint.cpp index 5ac219352..aaa4888bd 100644 --- a/test/selfadjoint.cpp +++ b/test/selfadjoint.cpp @@ -7,7 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#define EIGEN_NO_STATIC_ASSERT +#define TEST_CHECK_STATIC_ASSERTIONS #include "main.h" // This file tests the basic selfadjointView API, @@ -47,8 +47,8 @@ template void selfadjoint(const MatrixType& m) m4 -= m1.template selfadjointView(); VERIFY_IS_APPROX(m4, m2-m3); - VERIFY_RAISES_ASSERT(m2.template selfadjointView()); - VERIFY_RAISES_ASSERT(m2.template selfadjointView()); + VERIFY_RAISES_STATIC_ASSERT(m2.template selfadjointView()); + VERIFY_RAISES_STATIC_ASSERT(m2.template selfadjointView()); } void bug_159() From f7d17689a5e4609418c55ec47d7e5924e2785460 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Mar 2018 10:11:13 +0100 Subject: [PATCH 223/680] Add static assertion for fixed sizes Ref<> --- Eigen/src/Core/Ref.h | 2 ++ test/ref.cpp | 14 +++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index abb1e5121..ac9502bc4 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -95,6 +95,8 @@ protected: template EIGEN_DEVICE_FUNC void construct(Expression& expr) { + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression); + if(PlainObjectType::RowsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); diff --git a/test/ref.cpp b/test/ref.cpp index 769db0414..9dd2c0449 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -13,7 +13,7 @@ #endif #define TEST_ENABLE_TEMPORARY_TRACKING - +#define TEST_CHECK_STATIC_ASSERTIONS #include "main.h" // test Ref.h @@ -255,6 +255,17 @@ void test_ref_overloads() test_ref_ambiguous(A, B); } +void test_ref_fixed_size_assert() +{ + Vector4f v4; + VectorXf vx(10); + VERIFY_RAISES_STATIC_ASSERT( Ref y = v4; (void)y; ); + VERIFY_RAISES_STATIC_ASSERT( Ref y = vx.head<4>(); (void)y; ); + VERIFY_RAISES_STATIC_ASSERT( Ref y = v4; (void)y; ); + VERIFY_RAISES_STATIC_ASSERT( Ref y = vx.head<4>(); (void)y; ); + VERIFY_RAISES_STATIC_ASSERT( Ref y = 2*v4; (void)y; ); +} + void test_ref() { for(int i = 0; i < g_repeat; i++) { @@ -277,4 +288,5 @@ void test_ref() } CALL_SUBTEST_7( test_ref_overloads() ); + CALL_SUBTEST_7( test_ref_fixed_size_assert() ); } From e900b010c899b04b1710c5e99486b52494a40ea1 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 19 Mar 2018 09:04:54 -0700 Subject: [PATCH 224/680] Improve robustness of igamma and igammac to bad inputs. Check for nan inputs and propagate them immediately. Limit the number of internal iterations to 2000 (same number as used by scipy.special.gammainc). This prevents an infinite loop when the function is called with nan or very large arguments. Original change by mfirgunov@google.com --- .../src/SpecialFunctions/SpecialFunctionsImpl.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 5d1b8fcc2..7deca3e12 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -535,6 +535,10 @@ struct igammac_impl { return nan; } + if (numext::isnan(a) || numext::isnan(x)) { // propagate nans + return nan; + } + if ((x < one) || (x < a)) { /* The checks above ensure that we meet the preconditions for * igamma_impl::Impl(), so call it, rather than igamma_impl::Run(). 
@@ -592,7 +596,7 @@ struct igammac_impl { qkm1 = z * x; ans = pkm1 / qkm1; - while (true) { + for (int i = 0; i < 2000; i++) { c += one; y += one; z += two; @@ -724,6 +728,10 @@ struct igamma_impl { return nan; } + if (numext::isnan(a) || numext::isnan(x)) { // propagate nans + return nan; + } + if ((x > one) && (x > a)) { /* The checks above ensure that we meet the preconditions for * igammac_impl::Impl(), so call it, rather than igammac_impl::Run(). @@ -770,7 +778,7 @@ struct igamma_impl { c = one; ans = one; - while (true) { + for (int i = 0; i < 2000; i++) { r += one; c *= x/r; ans += c; From 624df5094597ef4427ba8877dcf00804493160fe Mon Sep 17 00:00:00 2001 From: Basil Fierz Date: Thu, 26 Oct 2017 22:44:28 +0200 Subject: [PATCH 225/680] Adds missing EIGEN_STRONG_INLINE to support MSVC properly inlining small vector calculations When working with MSVC often small vector operations are not properly inlined. This behaviour is observed even on the most recent compiler versions. --- Eigen/src/Core/Dot.h | 17 ++++++++++------- Eigen/src/Core/Product.h | 10 +++++----- Eigen/src/Core/ProductEvaluators.h | 20 ++++++++++---------- Eigen/src/Core/Redux.h | 2 +- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index bb8e3fecc..11da432b2 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -31,7 +31,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.template binaryExpr(b).sum(); } @@ -43,7 +44,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.transpose().template binaryExpr(b).sum(); } @@ -65,6 +67,7 @@ struct dot_nocheck template template EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType MatrixBase::dot(const MatrixBase& other) const { @@ -102,7 +105,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits -EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -117,7 +120,7 @@ EIGEN_DEVICE_FUNC inline typename NumTraits:: * \sa norm(), normalize() */ template -EIGEN_DEVICE_FUNC inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -139,7 +142,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -EIGEN_DEVICE_FUNC inline void MatrixBase::normalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -160,7 +163,7 @@ EIGEN_DEVICE_FUNC inline void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -EIGEN_DEVICE_FUNC inline const typename 
MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -185,7 +188,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -EIGEN_DEVICE_FUNC inline void MatrixBase::stableNormalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index ae0c94b38..676c48027 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } @@ -127,7 +127,7 @@ public: using Base::derived; typedef typename Base::Scalar Scalar; - operator const Scalar() const + EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } @@ -162,7 +162,7 @@ class ProductImpl public: - EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); @@ -170,7 +170,7 @@ class ProductImpl return internal::evaluator(derived()).coeff(row,col); } - EIGEN_DEVICE_FUNC Scalar coeff(Index i) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 86966abdb..fb637191d 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -32,7 +32,7 @@ struct evaluator > typedef Product XprType; typedef product_evaluator Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {} }; // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" @@ -55,7 +55,7 @@ struct evaluator, const Product > XprType; typedef evaluator > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) {} }; @@ -68,7 +68,7 @@ struct evaluator, DiagIndex> > typedef Diagonal, DiagIndex> XprType; typedef evaluator, DiagIndex> > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(Diagonal, DiagIndex>( Product(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), xpr.index() )) @@ -246,19 +246,19 @@ template struct generic_product_impl { template - 
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -312,25 +312,25 @@ struct generic_product_impl }; template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); } template - static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); } template - static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); } diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 2b5b73bf7..0a99acb21 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -407,7 +407,7 @@ protected: */ template template -EIGEN_DEVICE_FUNC typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); From 672bdc126b0923e6228a024ce62d1f18b05840ea Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 16 Nov 2017 17:55:24 +0100 Subject: [PATCH 226/680] bug #1479: fix failure detection in LDLT --- Eigen/src/Cholesky/LDLT.h | 2 ++ test/cholesky.cpp | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 968427b3a..13a8f6d14 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -375,6 +375,8 @@ template<> struct ldlt_inplace if((rs>0) && pivot_is_valid) A21 /= realAkk; + else if(rs>0) + ret = ret && (A21.array()==Scalar(0)).all(); if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed else if(!pivot_is_valid) found_zero_pivot = true; diff --git a/test/cholesky.cpp b/test/cholesky.cpp index 8ad5ac639..b4b6bda7d 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -373,6 +373,7 @@ template void cholesky_definiteness(const MatrixType& m) VERIFY(ldlt.info()==Success); VERIFY(!ldlt.isNegative()); VERIFY(!ldlt.isPositive()); + VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix()); } { mat << 1, 2, 2, 1; @@ -380,6 +381,7 @@ template void cholesky_definiteness(const MatrixType& m) VERIFY(ldlt.info()==Success); VERIFY(!ldlt.isNegative()); VERIFY(!ldlt.isPositive()); + VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix()); } { 
mat << 0, 0, 0, 0; @@ -387,6 +389,7 @@ template void cholesky_definiteness(const MatrixType& m) VERIFY(ldlt.info()==Success); VERIFY(ldlt.isNegative()); VERIFY(ldlt.isPositive()); + VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix()); } { mat << 0, 0, 0, 1; @@ -394,6 +397,7 @@ template void cholesky_definiteness(const MatrixType& m) VERIFY(ldlt.info()==Success); VERIFY(!ldlt.isNegative()); VERIFY(ldlt.isPositive()); + VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix()); } { mat << -1, 0, 0, 0; @@ -401,6 +405,7 @@ template void cholesky_definiteness(const MatrixType& m) VERIFY(ldlt.info()==Success); VERIFY(ldlt.isNegative()); VERIFY(!ldlt.isPositive()); + VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix()); } } @@ -452,6 +457,18 @@ void cholesky_faillure_cases() VERIFY(ldlt.info()==NumericalIssue); VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix()); } + + // bug 1479 + { + mat.resize(4,4); + mat << 1, 2, 0, 1, + 2, 4, 0, 2, + 0, 0, 0, 1, + 1, 2, 1, 1; + ldlt.compute(mat); + VERIFY(ldlt.info()==NumericalIssue); + VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix()); + } } template void cholesky_verify_assert() From 599a88da273ecaf00fec7cf9263a574ed44fef4f Mon Sep 17 00:00:00 2001 From: Zvi Rackover Date: Thu, 16 Nov 2017 19:53:38 +0000 Subject: [PATCH 227/680] Disable gcc-specific workaround for Clang to allow build with AVX512 There is currently a workaround for an issue in gcc that requires invoking gcc with the -fabi-version flag. This workaround is not needed for Clang and moreover is not supported. --- CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62bf823c8..8a068739b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,7 +232,10 @@ if(NOT MSVC) option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DEIGEN_ENABLE_AVX512") + if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") + endif() message(STATUS "Enabling AVX512 in tests/examples") endif() From 3dc6ff73cae64da97244e493905de7be40d1bcde Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 17 Nov 2017 22:54:39 +0100 Subject: [PATCH 228/680] Handle PGI compiler --- cmake/EigenTesting.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 8a995f118..4a34ddef5 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -549,6 +549,8 @@ macro(ei_get_compilerver VAR) else() set(${VAR} "na") endif() + elseif(${CMAKE_CXX_COMPILER_ID} MATCHES "PGI") + set(${VAR} "${CMAKE_CXX_COMPILER_ID}-${CMAKE_CXX_COMPILER_VERSION}") else() # on all other system we rely on ${CMAKE_CXX_COMPILER} # supporting a "--version" or "/version" flag From dd6de618c3fda4275aff3a57c590f82b6e628ac1 Mon Sep 17 00:00:00 2001 From: nluehr Date: Tue, 21 Nov 2017 10:47:00 -0800 Subject: [PATCH 229/680] Fix incorrect integer cast in predux(). Bug corrupts results on Maxwell and earlier GPU architectures. 
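For context, a sketch of the failure mode (assuming the usual semantics of the helpers involved, which match their use elsewhere in Half.h): raw_uint16_to_half() builds a half from a raw 16-bit pattern, e.g.

    Eigen::half one = Eigen::half_impl::raw_uint16_to_half(0x3C00); // 0x3C00 is the bit pattern of 1.0

whereas __float2half_rn() already returns a converted half value. Routing that converted value through raw_uint16_to_half(), as the fallback path below did, effectively treats it as a raw integer bit pattern and corrupts the reduction result whenever the native fp16 intrinsics are unavailable, i.e. on Maxwell and earlier.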
--- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ce48e4b31..e3e45b03a 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -231,7 +231,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); + return Eigen::half(__float2half_rn(a1 + a2)); #endif } @@ -265,7 +265,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const ha #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); + return Eigen::half(__float2half_rn(a1 * a2)); #endif } From 3587e481fb58710272d4aa71d082bf0669f03698 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 27 Nov 2017 21:53:02 +0100 Subject: [PATCH 230/680] silent MSVC warning --- Eigen/src/Core/AssignEvaluator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index dbe435d86..ebf5590de 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -97,7 +97,7 @@ private: public: enum { - Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal) : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) From d0b028e1731e7670c5ace4f79c8d48137dd55483 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 27 Nov 2017 22:11:57 +0100 Subject: [PATCH 231/680] clarify Pastix requirements --- Eigen/PaStiXSupport | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport index de3a63b4d..234619acc 100644 --- a/Eigen/PaStiXSupport +++ b/Eigen/PaStiXSupport @@ -36,6 +36,7 @@ extern "C" { * \endcode * * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies. + * This wrapper requires PaStiX version 5.x compiled without MPI support. * The dependencies depend on how PaSTiX has been compiled. * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task. * From aefd5fd5c4331f0265abb692d4742b558f13f01d Mon Sep 17 00:00:00 2001 From: nluehr Date: Tue, 28 Nov 2017 10:15:46 -0800 Subject: [PATCH 232/680] Replace __float2half_rn with __float2half The latter provides a consistent definition for CUDA 8.0 and 9.0.
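Both spellings are documented to convert in round-to-nearest-even mode, so the change should be behaviour-preserving; __float2half is simply the variant whose declaration is consistent between the CUDA 8.0 and 9.0 headers. Usage is unchanged, e.g.:

    __half h = __float2half(1.5f); // same value as __float2half_rn(1.5f)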
--- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index e3e45b03a..8a6f209c4 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -231,7 +231,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(__float2half_rn(a1 + a2)); + return Eigen::half(__float2half(a1 + a2)); #endif } @@ -265,7 +265,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const ha #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(__float2half_rn(a1 * a2)); + return Eigen::half(__float2half(a1 * a2)); #endif } From 3122477c8660f4e66e9cf4bf24e4fdfd6d56378c Mon Sep 17 00:00:00 2001 From: Yangzihao Wang Date: Tue, 12 Dec 2017 11:15:24 -0800 Subject: [PATCH 233/680] Update the padding computation for PADDING_SAME to be consistent with TensorFlow. --- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 4 ++ unsupported/test/cxx11_tensor_image_patch.cpp | 52 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 3c6a2e091..91d4ead28 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -265,6 +265,10 @@ struct TensorEvaluator, Device> // Calculate the padding m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + // The padding size calculation for PADDING_SAME has been updated to + // be consistent with how TensorFlow extracts its paddings. + m_rowPaddingTop = numext::maxi(0, m_rowPaddingTop); + m_colPaddingLeft = numext::maxi(0, m_colPaddingLeft); break; default: eigen_assert(false && "unexpected padding"); diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp index 475c59651..105d32fb4 100644 --- a/unsupported/test/cxx11_tensor_image_patch.cpp +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -405,6 +405,57 @@ void test_patch_padding_same() } } +// Verifies that SAME padding, when computed as negative values, will be clipped +// to zero. +void test_patch_padding_same_negative_padding_clip_to_zero() { + int input_depth = 1; + int input_rows = 15; + int input_cols = 1; + int input_batches = 1; + int ksize = 1; // Corresponds to the Rows and Cols for + // tensor.extract_image_patches<>. + int row_stride = 5; + int col_stride = 1; + // ColMajor + Tensor tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor result = tensor.extract_image_patches( + ksize, ksize, row_stride, col_stride, 1, 1, PADDING_SAME); + // row padding will be computed as -2 originally and then be clipped to 0. 
+ VERIFY_IS_EQUAL(result.coeff(0), 1.0f); + VERIFY_IS_EQUAL(result.coeff(1), 6.0f); + VERIFY_IS_EQUAL(result.coeff(2), 11.0f); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 3); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // RowMajor + Tensor tensor_row_major = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0)); + + Tensor result_row_major = + tensor_row_major.extract_image_patches(ksize, ksize, row_stride, + col_stride, 1, 1, PADDING_SAME); + VERIFY_IS_EQUAL(result_row_major.coeff(0), 1.0f); + VERIFY_IS_EQUAL(result_row_major.coeff(1), 6.0f); + VERIFY_IS_EQUAL(result_row_major.coeff(2), 11.0f); + + VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0)); +} + void test_patch_no_extra_dim() { Tensor tensor(2,3,5); @@ -754,4 +805,5 @@ void test_cxx11_tensor_image_patch() CALL_SUBTEST_4(test_patch_padding_valid_same_value()); CALL_SUBTEST_5(test_patch_padding_same()); CALL_SUBTEST_6(test_imagenet_patches()); + CALL_SUBTEST_7(test_patch_padding_same_negative_padding_clip_to_zero()); } From b2cacd189e2d3c2d841207029c399b80eecadd01 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 14 Dec 2017 10:01:02 +0100 Subject: [PATCH 234/680] fix header inclusion --- test/klu_support.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/klu_support.cpp b/test/klu_support.cpp index 8b1fdeb41..138dcc301 100644 --- a/test/klu_support.cpp +++ b/test/klu_support.cpp @@ -10,7 +10,7 @@ #define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" -#include +#include template void test_klu_support_T() { From 76c7dae600efc26a6fda212be518b804f101244f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 14 Dec 2017 14:22:14 +0100 Subject: [PATCH 235/680] ignore all *build* sub directories --- .hgignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.hgignore b/.hgignore index dcd9f4431..ebbf746bf 100644 --- a/.hgignore +++ b/.hgignore @@ -13,7 +13,7 @@ core core.* *.bak *~ -build* +*build* *.moc.* *.moc ui_* From 9c3aed9d48d7dbc0f88d2fb92ca232dcbf0d402e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 14 Dec 2017 14:24:33 +0100 Subject: [PATCH 236/680] Fix packet and alignment propagation logic of Block expressions. In particular, (A+B).col(j) lost vectorisation.
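A minimal reproducer of the lost vectorisation (an illustrative sketch, not code from the patch):

    #include <Eigen/Dense>
    using namespace Eigen;

    void col_of_sum(const MatrixXf& A, const MatrixXf& B, VectorXf& out, Index j)
    {
      // A column block of a sum expression: the evaluator change below keeps
      // the PacketAccessBit when the block has the same storage order as its
      // argument, so this assignment is vectorized again.
      out = (A + B).col(j);
    }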
int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret),
-    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,

     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
     FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@@ -1044,7 +1044,9 @@ struct evaluator >
     Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,

     PacketAlignment = unpacket_traits::alignment,
-    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
+                             && (OuterStrideAtCompileTime!=0)
+                             && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0)
   };
   typedef block_evaluator block_evaluator_type;
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index 83c1439ad..37e7495f5 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -207,6 +207,12 @@ struct vectorization_logic
     VERIFY(test_redux(Vector1(),
                       LinearVectorizedTraversal,CompleteUnrolling));

+    VERIFY(test_redux(Vector1().array()*Vector1().array(),
+                      LinearVectorizedTraversal,CompleteUnrolling));
+
+    VERIFY(test_redux((Vector1().array()*Vector1().array()).col(0),
+                      LinearVectorizedTraversal,CompleteUnrolling));
+
     VERIFY(test_redux(Matrix(),
                       LinearVectorizedTraversal,CompleteUnrolling));

From 546ab97d7679c75fadbf9e003c7c75faa9a179e5 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 14 Dec 2017 14:47:38 +0100
Subject: [PATCH 237/680] Add possibility to overwrite EIGEN_STRONG_INLINE.

---
 Eigen/src/Core/util/Macros.h   | 2 ++
 doc/PreprocessorDirectives.dox | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 3a56b0e36..e351b7ad9 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -514,11 +514,13 @@
 // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
 // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
 // but GCC is still doing fine with just inline.
+#ifndef EIGEN_STRONG_INLINE
 #if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
 #define EIGEN_STRONG_INLINE __forceinline
 #else
 #define EIGEN_STRONG_INLINE inline
 #endif
+#endif

 // EIGEN_ALWAYS_INLINE is the strongest, it has the effect of making the function inline and adding every possible
 // attribute to maximize inlining. This should only be used when really necessary: in particular,
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index 0919d4190..b6d08c700 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -122,6 +122,10 @@ run time. However, these assertions do cost time and can thus be turned off.
   this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
 - \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host
   only, and never called from device code.
+ - \b \c EIGEN_STRONG_INLINE - This macro is used to qualify critical functions and methods that we expect the compiler to inline.
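+   It can be overridden by defining it before including any %Eigen header; as a minimal sketch (mapping it to plain
+   \c inline here is only an example):
+   \code
+   #define EIGEN_STRONG_INLINE inline
+   #include <Eigen/Dense>
+   \endcode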
+   By default it is defined to \c __forceinline for MSVC and ICC, and to \c inline for other compilers. A typical usage is to
+   define it to \c inline for MSVC users wanting faster compilation times, at the risk of performance degradations in some rare
+   cases for which the MSVC inliner fails to do a good job.
 - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen
   will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off
   vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.

From 26a2c6fc1676d75f4360a993461d8e2778d454f2 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 14 Dec 2017 15:11:04 +0100
Subject: [PATCH 238/680] fix unit test

---
 test/sparse_basic.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 384985028..f84b6e3f5 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -228,8 +228,8 @@ template void sparse_basic(const SparseMatrixType& re
       VERIFY_RAISES_ASSERT( m1 -= m1.innerVector(0) );
       VERIFY_RAISES_ASSERT( refM1 -= m1.innerVector(0) );
       VERIFY_RAISES_ASSERT( refM1 += m1.innerVector(0) );
-      m1 = m4; refM1 = refM4;
     }
+    m1 = m4; refM1 = refM4;

     // test aliasing
     VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1));

From 31e0bda2e3b44b908e63f3b19ade2f4af12c7e10 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 14 Dec 2017 15:48:27 +0100
Subject: [PATCH 239/680] Fix cmake warning

---
 bench/spbench/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench/spbench/CMakeLists.txt b/bench/spbench/CMakeLists.txt
index 932735698..029ba6d6b 100644
--- a/bench/spbench/CMakeLists.txt
+++ b/bench/spbench/CMakeLists.txt
@@ -60,7 +60,7 @@ if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS AND BLAS_FOUND)
   endif(SCOTCH_FOUND)
   set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES})
   set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP})
-endif(PASTIX_FOUND AND BLAS_FOUND)
+endif()

 if(METIS_FOUND)
   include_directories(${METIS_INCLUDE_DIRS})

From 73214c4bd06108ee3a26204acbbda205a40131bc Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 15 Dec 2017 14:10:59 +0100
Subject: [PATCH 240/680] Workaround nvcc 9.0 issue. See PR 351.
https://bitbucket.org/eigen/eigen/pull-requests/351

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 0d6331e9c..1d459a3a0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -836,7 +836,8 @@ class TensorBase
   protected:
     template friend class Tensor;
     template friend class TensorFixedSize;
-    template friend class TensorBase;
+    // the Eigen:: prefix is required to work around a compilation issue with nvcc 9.0
+    template friend class Eigen::TensorBase;

     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); }
 };
@@ -852,7 +853,8 @@ class TensorBase : public TensorBase {
     template friend class Tensor;
     template friend class TensorFixedSize;
-    template friend class TensorBase;
+    // the Eigen:: prefix is required to work around a compilation issue with nvcc 9.0
+    template friend class Eigen::TensorBase;

     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setZero() {

From 06bf1047f99afec61062e188ec4e68efbf203d86 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Fri, 15 Dec 2017 15:15:37 +0100
Subject: [PATCH 241/680] Fix compilation of stableNorm with some expressions
 as input

---
 Eigen/src/Core/StableNorm.h |  2 +-
 test/stable_norm.cpp        | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index 4ea598ba8..e0e7a0c8f 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -165,7 +165,7 @@ MatrixBase::stableNorm() const
   typedef typename internal::nested_eval::type DerivedCopy;
   typedef typename internal::remove_all::type DerivedCopyClean;
-  DerivedCopy copy(derived());
+  const DerivedCopy copy(derived());

   enum {
     CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index c3eb5ff31..3c02474b8 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -65,6 +65,8 @@ template void stable_norm(const MatrixType& m)
   factor = internal::random();
   Scalar small = factor * ((std::numeric_limits::min)() * RealScalar(1e4));

+  Scalar one(1);
+
   MatrixType vzero = MatrixType::Zero(rows, cols),
              vrand = MatrixType::Random(rows, cols),
              vbig(rows, cols),
@@ -78,6 +80,14 @@ template void stable_norm(const MatrixType& m)
   VERIFY_IS_APPROX(vrand.blueNorm(), vrand.norm());
   VERIFY_IS_APPROX(vrand.hypotNorm(), vrand.norm());

+  // test with expressions as input
+  VERIFY_IS_APPROX((one*vrand).stableNorm(), vrand.norm());
+  VERIFY_IS_APPROX((one*vrand).blueNorm(), vrand.norm());
+  VERIFY_IS_APPROX((one*vrand).hypotNorm(), vrand.norm());
+  VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).stableNorm(), vrand.norm());
+  VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).blueNorm(), vrand.norm());
+  VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).hypotNorm(), vrand.norm());
+
   RealScalar size = static_cast(m.size());

   // test numext::isfinite

From f9bdcea022e24bac4a66a937c37de92f7f22b9da Mon Sep 17 00:00:00 2001
From: nluehr
Date: Mon, 18 Dec 2017 16:51:15 -0800
Subject: [PATCH 242/680] For cuda 9.1 replace math_functions.hpp with
 cuda_runtime.h

---
 Eigen/Core | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/Core b/Eigen/Core
index c66359b79..5a6dec8cc 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -54,9 +54,9 @@
   #endif
   #define EIGEN_DEVICE_FUNC __host__ __device__
-  //
We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro + // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro // works properly on the device side - #include + #include #else #define EIGEN_DEVICE_FUNC #endif From 59985cfd26416fb6b196af868c187e90d237c352 Mon Sep 17 00:00:00 2001 From: RJ Ryan Date: Sun, 31 Dec 2017 10:44:56 -0500 Subject: [PATCH 243/680] Disable use of recurrence for computing twiddle factors. Fixes FFT precision issues for large FFTs. https://github.com/tensorflow/tensorflow/issues/10749#issuecomment-354557689 --- .../Eigen/CXX11/src/Tensor/TensorFFT.h | 40 ++++++++++++------- unsupported/test/cxx11_tensor_fft.cpp | 28 +++++++++++++ 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 10e0a8a6b..f81da318c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -231,20 +231,32 @@ struct TensorEvaluator, D // t_n = exp(sqrt(-1) * pi * n^2 / line_len) // for n = 0, 1,..., line_len-1. // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - pos_j_base_powered[0] = ComplexScalar(1, 0); - if (line_len > 1) { - const RealScalar pi_over_len(EIGEN_PI / line_len); - const ComplexScalar pos_j_base = ComplexScalar( - std::cos(pi_over_len), std::sin(pi_over_len)); - pos_j_base_powered[1] = pos_j_base; - if (line_len > 2) { - const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int j = 2; j < line_len + 1; ++j) { - pos_j_base_powered[j] = pos_j_base_powered[j - 1] * - pos_j_base_powered[j - 1] / - pos_j_base_powered[j - 2] * pos_j_base_sq; - } - } + + // The recurrence is correct in exact arithmetic, but causes + // numerical issues for large transforms, especially in + // single-precision floating point. + // + // pos_j_base_powered[0] = ComplexScalar(1, 0); + // if (line_len > 1) { + // const ComplexScalar pos_j_base = ComplexScalar( + // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); + // pos_j_base_powered[1] = pos_j_base; + // if (line_len > 2) { + // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + // for (int i = 2; i < line_len + 1; ++i) { + // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + // pos_j_base_powered[i - 1] / + // pos_j_base_powered[i - 2] * + // pos_j_base_sq; + // } + // } + // } + // TODO(rmlarsen): Find a way to use Eigen's vectorized sin + // and cosine functions here. 
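+      // Direct evaluation costs one cos/sin pair per factor instead of a few
+      // multiplies, but the rounding error of each factor stays independent
+      // rather than compounding across iterations of the recurrence.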
+ for (int j = 0; j < line_len + 1; ++j) { + double arg = ((EIGEN_PI * j) * j) / line_len; + std::complex tmp(numext::cos(arg), numext::sin(arg)); + pos_j_base_powered[j] = static_cast(tmp); } } diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index 2f14ebc62..a55369477 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp @@ -224,6 +224,32 @@ static void test_fft_real_input_energy() { } } +template +static void test_fft_non_power_of_2_round_trip(int exponent) { + int n = (1 << exponent) + 1; + + Eigen::DSizes dimensions; + dimensions[0] = n; + const DSizes arr = dimensions; + Tensor input; + + input.resize(arr); + input.setRandom(); + + array fft; + fft[0] = 0; + + Tensor, 1, ColMajor> forward = + input.template fft(fft); + + Tensor output = + forward.template fft(fft); + + for (int i = 0; i < n; ++i) { + VERIFY_IS_APPROX(input[i], output[i]); + } +} + void test_cxx11_tensor_fft() { test_fft_complex_input_golden(); test_fft_real_input_golden(); @@ -270,4 +296,6 @@ void test_cxx11_tensor_fft() { test_fft_real_input_energy(); test_fft_real_input_energy(); test_fft_real_input_energy(); + + test_fft_non_power_of_2_round_trip(7); } From 73629f8b680fb4107b490ea99ce7dfc3b5332775 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 9 Jan 2018 08:59:27 +0100 Subject: [PATCH 244/680] Fix gcc7 warning --- Eigen/src/Core/CoreEvaluators.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index dfdb63e98..d1454117a 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -136,7 +136,9 @@ template class plainobjectbase_evaluator_data { public: EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) { - EIGEN_ONLY_USED_FOR_DEBUG(outerStride); +#ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(outerStride); +#endif eigen_internal_assert(outerStride==OuterStride); } EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; } From f558ad2955ef4899f208883f46c410273e21451d Mon Sep 17 00:00:00 2001 From: Eugene Chereshnev Date: Wed, 3 Jan 2018 12:55:52 -0800 Subject: [PATCH 245/680] Fix incorrect ldvt in LAPACKE call from JacobiSVD --- Eigen/src/SVD/JacobiSVD_LAPACKE.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h index 50272154f..ff0516f61 100644 --- a/Eigen/src/SVD/JacobiSVD_LAPACKE.h +++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h @@ -61,9 +61,10 @@ JacobiSVD, ColPiv u = (LAPACKE_TYPE*)m_matrixU.data(); \ } else { ldu=1; u=&dummy; }\ MatrixType localV; \ - ldvt = (m_computeFullV) ? internal::convert_index(m_cols) : (m_computeThinV) ? internal::convert_index(m_diagSize) : 1; \ + lapack_int vt_rows = (m_computeFullV) ? internal::convert_index(m_cols) : (m_computeThinV) ? 
internal::convert_index(m_diagSize) : 1; \ if (computeV()) { \ - localV.resize(ldvt, m_cols); \ + localV.resize(vt_rows, m_cols); \ + ldvt = internal::convert_index(localV.outerStride()); \ vt = (LAPACKE_TYPE*)localV.data(); \ } else { ldvt=1; vt=&dummy; }\ Matrix superb; superb.resize(m_diagSize, 1); \ From 5b3c367926bf0e55cb2ec5359605349909f9f7e4 Mon Sep 17 00:00:00 2001 From: "Lee.Deokjae" Date: Sat, 6 Jan 2018 14:36:19 +0900 Subject: [PATCH 246/680] Fix typos in the contraction example of tensor README --- unsupported/Eigen/CXX11/src/Tensor/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index 30d553af7..49cc33c5d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1013,23 +1013,23 @@ multidimensional case. Eigen::Tensor a(2, 3); a.setValues({{1, 2, 3}, {6, 5, 4}}); Eigen::Tensor b(3, 2); - a.setValues({{1, 2}, {4, 5}, {5, 6}}); + b.setValues({{1, 2}, {4, 5}, {5, 6}}); // Compute the traditional matrix product - Eigen::array, 1> product_dims = { Eigen::IndexPair(1, 0) }; + Eigen::array, 1> product_dims = { Eigen::IndexPair(1, 0) }; Eigen::Tensor AB = a.contract(b, product_dims); // Compute the product of the transpose of the matrices - Eigen::array, 1> transpose_product_dims = { Eigen::IndexPair(0, 1) }; + Eigen::array, 1> transposed_product_dims = { Eigen::IndexPair(0, 1) }; Eigen::Tensor AtBt = a.contract(b, transposed_product_dims); - - // Contraction to scalar value using a ouble contraction - // First coordinate of both tensors are contracted as well as both second coordinates + + // Contraction to scalar value using a double contraction. + // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements. Eigen::array, 2> double_contraction_product_dims = { Eigen::IndexPair(0, 0), Eigen::IndexPair(1, 1) }; - Eigen::Tensor AdoubleontractedA = a.contract(a, double_contraction_product_dims); - + Eigen::Tensor AdoubleContractedA = a.contract(a, double_contraction_product_dims); + // Extracting the scalar value of the tensor contraction for further usage - int value = AdoublecontractedA(0); + int value = AdoubleContractedA(0); ## Reduction Operations From 09a16ba42fa1acc7bb0ace489ee51b3eb958ffa0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 17 Jan 2018 23:13:16 +0100 Subject: [PATCH 247/680] bug #1412: fix compilation with nvcc+MSVC --- Eigen/src/Geometry/Scaling.h | 24 +++++++++++++++++++++--- Eigen/src/SVD/BDCSVD.h | 12 ++++++------ Eigen/src/SVD/JacobiSVD.h | 2 +- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index f58ca03d9..c1899a029 100755 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -29,6 +29,22 @@ namespace Eigen { * * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform */ + +namespace internal +{ + // This helper helps nvcc+MSVC to properly parse this file. + // See bug 1412. + template + struct uniformscaling_times_affine_returntype + { + enum + { + NewMode = int(Mode) == int(Isometry) ? 
Affine : Mode + }; + typedef Transform type; + }; +} + template class UniformScaling { @@ -60,9 +76,11 @@ public: /** Concatenates a uniform scaling and an affine transformation */ template - inline Transform operator* (const Transform& t) const + inline typename + internal::uniformscaling_times_affine_returntype ::type + operator* (const Transform& t) const { - Transform res = t; + typename internal::uniformscaling_times_affine_returntype res = t; res.prescale(factor()); return res; } @@ -70,7 +88,7 @@ public: /** Concatenates a uniform scaling and a linear transformation matrix */ // TODO returns an expression template - inline typename internal::plain_matrix_type::type operator* (const MatrixBase& other) const + inline typename Eigen::internal::plain_matrix_type::type operator* (const MatrixBase& other) const { return other * m_factor; } template diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 0abd4c1bb..06865a331 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -217,7 +217,7 @@ public: // Method to allocate and initialize matrix and attributes template -void BDCSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void BDCSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { m_isTranspose = (cols > rows); @@ -393,7 +393,7 @@ void BDCSVD::structured_update(Block A, co //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template -void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::divide (Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { // requires rows = cols + 1; using std::pow; @@ -573,7 +573,7 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, // handling of round-off errors, be consistent in ordering // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf template -void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) +void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits::min)(); using std::abs; @@ -1059,7 +1059,7 @@ void BDCSVD::computeSingVecs // i >= 1, di almost null and zi non null. 
// We use a rotation to zero out zi applied to the left of M template -void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index size) +void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1088,7 +1088,7 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index // We apply two rotations to have zj = 0; // TODO deflation44 is still broken and not properly tested template -void BDCSVD::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size) +void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1128,7 +1128,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive] template -void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { using std::sqrt; using std::abs; diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 43488b1e0..1c7c80376 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -610,7 +610,7 @@ template class JacobiSVD }; template -void JacobiSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void JacobiSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { eigen_assert(rows >= 0 && cols >= 0); From 5539587b1f5b5922b2419b0a4468cf2f393def51 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 17 Jul 2018 10:29:12 +0200 Subject: [PATCH 248/680] Some warning fixes --- Eigen/src/Householder/HouseholderSequence.h | 2 +- Eigen/src/SparseLU/SparseLU.h | 3 --- unsupported/Eigen/src/IterativeSolvers/DGMRES.h | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index af19701a4..e62befcb6 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -356,7 +356,7 @@ template class HouseholderS if(m_length>=BlockSize && dst.cols()>1) { // Make sure we have at least 2 useful blocks, otherwise it is point-less: - Index blockSize = m_length<2*BlockSize ? 
(m_length+1)/2 : BlockSize; + Index blockSize = m_length::factorize(const MatrixType& matrix) eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices"); - typedef typename IndexVector::Scalar StorageIndex; - m_isInitialized = true; - // Apply the column permutation computed in analyzepattern() // m_mat = matrix * m_perm_c.inverse(); m_mat = matrix; diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h index be039e07f..85a4f696c 100644 --- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h @@ -392,7 +392,6 @@ inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_Matr template< typename _MatrixType, typename _Preconditioner> inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const RealSchur& schurofH) const { - typedef typename MatrixType::Index Index; const DenseMatrix& T = schurofH.matrixT(); Index it = T.rows(); ComplexVector eig(it); From 01fd4096d395e7b816459f571bf2328c8435cc37 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 10 Jul 2018 13:16:38 -0700 Subject: [PATCH 249/680] Fuse computations into the Tensor contractions using output kernel --- .../Eigen/CXX11/src/Tensor/TensorBase.h | 10 +- .../CXX11/src/Tensor/TensorContraction.h | 113 ++++++++++++++---- .../src/Tensor/TensorContractionThreadPool.h | 51 ++++++-- .../src/Tensor/TensorForwardDeclarations.h | 4 +- unsupported/test/cxx11_tensor_contraction.cpp | 51 ++++++++ unsupported/test/cxx11_tensor_thread_pool.cpp | 56 +++++++++ 6 files changed, 248 insertions(+), 37 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index bdc1a17a7..97f90f638 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -517,9 +517,15 @@ class TensorBase typedef Eigen::IndexPair DimensionPair; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp + const TensorContractionOp contract(const OtherDerived& other, const Dimensions& dims) const { - return TensorContractionOp(derived(), other.derived(), dims); + return TensorContractionOp(derived(), other.derived(), dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const { + return TensorContractionOp(derived(), other.derived(), dims, output_kernel); } // Convolutions. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 979fcf4d9..85126a127 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -85,8 +85,8 @@ template #endif -template -struct traits > +template +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
typedef typename gebp_traits::type, @@ -112,23 +112,24 @@ struct traits > }; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorContractionOp& type; + typedef const TensorContractionOp& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorContractionOp type; + typedef TensorContractionOp type; }; -template -struct traits, Device_> > { +template +struct traits, Device_> > { typedef Indices_ Indices; typedef LeftArgType_ LeftArgType; typedef RightArgType_ RightArgType; + typedef OutputKernelType_ OutputKernelType; typedef Device_ Device; // From NumDims below. @@ -137,8 +138,52 @@ struct traits -class TensorContractionOp : public TensorBase, ReadOnlyAccessors> +// Tensor contraction params that should enable to get from output matrix +// 2-dimensional coordinates to the output tensor dimensions. +struct TensorContractionParams { + // TensorContraction evaluator assumes that both tensors are in ColMajor + // layout, if tensors are in RowMajor evaluator swap lhs with rhs. + bool swapped_arguments; +}; + +// Output kernel allows to fuse operations into the tensor contraction. +// +// Examples: +// 1. Elementwise Relu transformation following Conv2D. +// 2. AddBias to the Conv2D output channels dimension. +// +// See expected implementation in NoOpOutputKernel. +struct OutputKernel { + template + using OutputMapper = internal::blas_data_mapper; +}; + +// Output kernel that does absolutely nothing. +struct NoOpOutputKernel { + /** + * Tensor contraction evaluator calls this kernel after finishing each block + * of output matrix. Output blocks belong to the 2-dimensional output tensor. + * + * TensorContractionParams contains contraction dimensions information + * required to map output 2-d space into the expected output tensor space + * (potentially higher dimensional). 
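+   *
+   * As a sketch, a kernel applying an elementwise function would read each
+   * coefficient of the num_rows x num_cols block through output_mapper(i, j)
+   * and write the transformed value back through the same mapper (see the
+   * SqrtOutputKernel added to the unit tests below).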
+ * + * \param[in] output_mapper Access to output tensor memory + * \param[in] params Tensor contraction parameters + * \param[in] i Index of a first row available through output_mapper + * \param[in] j Index of a first column available through output_mapper + * \param[in] num_rows Number of available rows + * \param[in] num_cols Number of available columns + */ + template + EIGEN_ALWAYS_INLINE void operator()( + const OutputKernel::OutputMapper& output_mapper, + const TensorContractionParams& params, Index i, Index j, Index num_rows, + Index num_cols) const {} +}; + +template +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -149,8 +194,10 @@ class TensorContractionOp : public TensorBase::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( - const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims, + const OutputKernelType& output_kernel = OutputKernelType()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), + m_output_kernel(output_kernel) {} EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } @@ -164,10 +211,14 @@ class TensorContractionOp : public TensorBase::type& rhsExpression() const { return m_rhs_xpr; } + EIGEN_DEVICE_FUNC + const OutputKernelType& outputKernel() const { return m_output_kernel; } + protected: typename LhsXprType::Nested m_lhs_xpr; typename RhsXprType::Nested m_rhs_xpr; const Indices m_indices; + const OutputKernelType m_output_kernel; }; @@ -177,9 +228,10 @@ struct TensorContractionEvaluatorBase typedef typename internal::traits::Indices Indices; typedef typename internal::traits::LeftArgType LeftArgType; typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::OutputKernelType OutputKernelType; typedef typename internal::traits::Device Device; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -221,6 +273,7 @@ struct TensorContractionEvaluatorBase op.lhsExpression(), op.rhsExpression()), device), m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), op.lhsExpression()), device), + m_output_kernel(op.outputKernel()), m_device(device), m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == @@ -391,6 +444,13 @@ struct TensorContractionEvaluatorBase numext::swap(m_dimensions[i], m_dimensions[j]); } } + + // A set of parameters that will allow output kernel to get from output + // tensor dimensions (i, j) into the original tensor dimensions. + // TODO(ezhulenev): Add parameters required to infer output tensor index for + // more complex contractions than 2x2 on internal dimension. 
+ m_tensor_contraction_params = { + /**swapped_arguments=*/static_cast(Layout) == RowMajor}; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -585,7 +645,15 @@ struct TensorContractionEvaluatorBase // call gebp (matrix kernel) // The parameters here are copied from Eigen's GEMM implementation - gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0); + const auto output_mapper = output.getSubMapper(i2, j2); + gebp(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc, + Scalar(1), -1, -1, 0, 0); + + // We are done with this [i2, j2] output block. + if (k2 + kc >= k) { + m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, + actual_mc, actual_nc); + } } } } @@ -848,23 +916,26 @@ protected: Index m_j_size; Index m_k_size; + TensorContractionParams m_tensor_contraction_params; + TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; const Device& m_device; + OutputKernelType m_output_kernel; Scalar* m_result; bool m_can_use_xsmm; }; // evaluator for default device -template -struct TensorEvaluator, Device> : +template +struct TensorEvaluator, Device> : public TensorContractionEvaluatorBase< - TensorEvaluator, Device> > { - typedef TensorEvaluator, Device> Self; + TensorEvaluator, Device> > { + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 3c007b183..d7536bd6a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -56,16 +56,16 @@ struct packRhsAndKernelArg { } // end namespace internal #endif // EIGEN_USE_SIMPLE_THREAD_POOL -template -struct TensorEvaluator, ThreadPoolDevice> : - public TensorContractionEvaluatorBase, ThreadPoolDevice> > { +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { typedef ThreadPoolDevice Device; - typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -308,7 +308,7 @@ struct TensorEvaluatorm_k_strides); Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, + OutputMapper>(this, num_threads, lhs, rhs, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack) .run(); @@ -319,16 +319,18 @@ struct TensorEvaluator class Context { public: - Context(const Device& device, int num_threads, LhsMapper& lhs, + Context(const Self* self, int num_threads, LhsMapper& lhs, RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col, bool parallel_pack) - : device_(device), + : device_(self->m_device), lhs_(lhs), rhs_(rhs), buffer_(buffer), output_(buffer, tm), + output_kernel_(self->m_output_kernel), + 
tensor_contraction_params_(self->m_tensor_contraction_params), num_threads_(num_threads), shard_by_col_(shard_by_col), parallel_pack_(parallel_pack), @@ -420,6 +422,8 @@ struct TensorEvaluator::value, + "SimpleThreadPool does not support contraction output kernels."); template void evalProduct(Scalar* buffer) const { @@ -1065,6 +1086,10 @@ struct TensorEvaluator::value, + "XSMM does not support contraction output kernels."); + template class ContextXsmm { public: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 6c237bac3..19e456e19 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -65,7 +65,7 @@ template class Ma template class TensorIndexTupleOp; template class TensorTupleReducerOp; template class TensorConcatenationOp; -template class TensorContractionOp; +template class TensorContractionOp; template class TensorConversionOp; template class TensorConvolutionOp; template class TensorFFTOp; @@ -97,6 +97,8 @@ template class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; +class NoOpOutputKernel; + struct DefaultDevice; struct ThreadPoolDevice; struct GpuDevice; diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index ace97057f..918c96277 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -510,6 +510,55 @@ static void test_const_inputs() VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); } +// Apply Sqrt to all output elements. +struct SqrtOutputKernel { + template + EIGEN_ALWAYS_INLINE void operator()( + const OutputKernel::OutputMapper& output_mapper, + const TensorContractionParams&, Index, Index, Index num_rows, + Index num_cols) const { + for (int i = 0; i < num_rows; ++i) { + for (int j = 0; j < num_cols; ++j) { + output_mapper(i, j) = std::sqrt(output_mapper(i, j)); + } + } + } +}; + +template +static void test_large_contraction_with_output_kernel() { + Tensor t_left(30, 50, 8, 31); + Tensor t_right(8, 31, 7, 20, 10); + Tensor t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in mat4 to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. 
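+  // (The offset keeps both operands non-negative, so every coefficient of the
+  // contraction result is non-negative and the square root applied by the
+  // output kernel below stays real.)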
+ t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result = t_left.contract(t_right, dims, SqrtOutputKernel()); + + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + void test_cxx11_tensor_contraction() { CALL_SUBTEST(test_evals()); @@ -542,4 +591,6 @@ void test_cxx11_tensor_contraction() CALL_SUBTEST(test_tensor_product()); CALL_SUBTEST(test_const_inputs()); CALL_SUBTEST(test_const_inputs()); + CALL_SUBTEST(test_large_contraction_with_output_kernel()); + CALL_SUBTEST(test_large_contraction_with_output_kernel()); } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 2ef665f30..ea9d8afdc 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -232,6 +232,60 @@ void test_multithread_contraction_agrees_with_singlethread() { } } +// Apply Sqrt to all output elements. +struct SqrtOutputKernel { + template + EIGEN_ALWAYS_INLINE void operator()( + const OutputKernel::OutputMapper& output_mapper, + const TensorContractionParams&, Index, Index, Index num_rows, + Index num_cols) const { + for (int i = 0; i < num_rows; ++i) { + for (int j = 0; j < num_cols; ++j) { + output_mapper(i, j) = std::sqrt(output_mapper(i, j)); + } + } + } +}; + +template +static void test_multithread_contraction_with_output_kernel() { + typedef Tensor::DimensionPair DimPair; + + const int num_threads = internal::random(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor t_left(30, 50, 8, 31); + Tensor t_right(8, 31, 7, 20, 10); + Tensor t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in mat4 to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel()); + + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} template void test_full_contraction() { @@ -355,6 +409,8 @@ void test_cxx11_tensor_thread_pool() CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread()); CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel()); + CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel()); // Exercise various cases that have been problematic in the past. 
CALL_SUBTEST_4(test_contraction_corner_cases()); From b324ed55d969b28ff84343b0840137a6b56300f1 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 12 Jul 2018 14:52:23 -0700 Subject: [PATCH 250/680] Call OutputKernel in evalGemv From e204ecdaafa6c5642a4286a1ffb19e9964e32201 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 16 Jul 2018 15:06:57 -0700 Subject: [PATCH 251/680] Remove SimpleThreadPool and always use {NonBlocking}ThreadPool --- unsupported/Eigen/CXX11/ThreadPool | 13 - .../src/Tensor/TensorContractionThreadPool.h | 324 ------------------ .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 8 - .../src/ThreadPool/NonBlockingThreadPool.h | 16 +- .../CXX11/src/ThreadPool/SimpleThreadPool.h | 162 --------- 5 files changed, 8 insertions(+), 515 deletions(-) delete mode 100644 unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool index c34614194..cbb3bbf2c 100644 --- a/unsupported/Eigen/CXX11/ThreadPool +++ b/unsupported/Eigen/CXX11/ThreadPool @@ -55,21 +55,8 @@ #include "src/ThreadPool/RunQueue.h" #include "src/ThreadPool/ThreadPoolInterface.h" #include "src/ThreadPool/ThreadEnvironment.h" -#include "src/ThreadPool/SimpleThreadPool.h" #include "src/ThreadPool/NonBlockingThreadPool.h" - -// Use the more efficient NonBlockingThreadPool by default. -namespace Eigen { -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL -template using ThreadPoolTempl = NonBlockingThreadPoolTempl; -typedef NonBlockingThreadPool ThreadPool; -#else -template using ThreadPoolTempl = SimpleThreadPoolTempl; -typedef SimpleThreadPool ThreadPool; -#endif -} // namespace Eigen - #endif #include diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index d7536bd6a..c774d15e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -15,47 +15,6 @@ namespace Eigen { -#ifdef EIGEN_USE_SIMPLE_THREAD_POOL -namespace internal { - -template -struct packLhsArg { - LhsScalar* blockA; - const LhsMapper& lhs; - const Index m_start; - const Index k_start; - const Index mc; - const Index kc; -}; - -template -struct packRhsAndKernelArg { - const MaxSizeVector* blockAs; - RhsScalar* blockB; - const RhsMapper& rhs; - OutputMapper& output; - const Index m; - const Index k; - const Index n; - const Index mc; - const Index kc; - const Index nc; - const Index num_threads; - const Index num_blockAs; - const Index max_m; - const Index k_block_idx; - const Index m_block_idx; - const Index n_block_idx; - const Index m_blocks; - const Index n_blocks; - MaxSizeVector* kernel_notifications; - const MaxSizeVector* lhs_notifications; - const bool need_to_pack; -}; - -} // end namespace internal -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - template struct TensorEvaluator, ThreadPoolDevice> : public TensorContractionEvaluatorBase, ThreadPoolDevice> > { @@ -112,7 +71,6 @@ struct TensorEvaluator void evalProduct(Scalar* buffer) const { @@ -763,288 +721,6 @@ struct TensorEvaluator::value, - "SimpleThreadPool does not support contraction output kernels."); - - template - void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - evalGemm(buffer); - } - - template - void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = 
this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - - const int lhs_packet_size = internal::unpacket_traits::size; - const int rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // TODO: packing could be faster sometimes if we supported row major tensor mappers - typedef internal::gemm_pack_lhs LhsPacker; - typedef internal::gemm_pack_rhs RhsPacker; - - // TODO: replace false, false with conjugate values? - typedef internal::gebp_kernel GebpKernel; - - typedef internal::packLhsArg packLArg; - typedef internal::packRhsAndKernelArg packRKArg; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // compute block sizes (which depend on number of threads) - const Index num_threads = this->m_device.numThreads(); - internal::TensorContractionBlocking blocking(k, m, n, num_threads); - Index mc = blocking.mc(); - Index nc = blocking.nc(); - Index kc = blocking.kc(); - eigen_assert(mc <= m); - eigen_assert(nc <= n); - eigen_assert(kc <= k); - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) - const Index k_blocks = CEIL_DIV(k, kc); - const Index n_blocks = CEIL_DIV(n, nc); - const Index m_blocks = CEIL_DIV(m, mc); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - /* cout << "m: " << m << " n: " << n << " k: " << k << endl; - cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; - cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; - cout << "num threads: " << num_threads << endl; - */ - - // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB - // aren't 16 byte aligned segfaults will happen due to SIMD instructions - // note: You can get away with allocating just a single blockA and offsets and meet the - // the alignment requirements with the assumption that - // (Traits::mr * sizeof(ResScalar)) % 16 == 0 - const Index numBlockAs = numext::mini(num_threads, m_blocks); - MaxSizeVector blockAs(num_threads); - for (int i = 0; i < num_threads; i++) { - blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); - } - - // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread - // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. - // Other options: (1) reuse memory when a thread finishes. con: tricky - // (2) allocate block B memory in each thread. 
con: overhead - MaxSizeVector blockBs(n_blocks); - for (int i = 0; i < n_blocks; i++) { - blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); - } - - // lhs_notifications starts with all null Notifications - MaxSizeVector lhs_notifications(num_threads, nullptr); - - // this should really be numBlockAs * n_blocks; - const Index num_kernel_notifications = num_threads * n_blocks; - MaxSizeVector kernel_notifications(num_kernel_notifications, - nullptr); - - for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { - const Index k_start = k_block_idx * kc; - // make sure we don't overshoot right edge of left matrix - const Index actual_kc = numext::mini(k_start + kc, k) - k_start; - - for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { - const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); - - for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { - const Index m_start = mt_block_idx * mc; - const Index actual_mc = numext::mini(m_start + mc, m) - m_start; - eigen_assert(actual_mc > 0); - - Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; - - for (int i = 0; i < n_blocks; ++i) { - Index notification_id = (blockAId * n_blocks + i); - // Wait for any current kernels using this slot to complete - // before using it. - if (kernel_notifications[notification_id]) { - wait_until_ready(kernel_notifications[notification_id]); - delete kernel_notifications[notification_id]; - } - kernel_notifications[notification_id] = new Notification(); - } - const packLArg arg = { - blockAs[blockAId], // blockA - lhs, // lhs - m_start, // m - k_start, // k - actual_mc, // mc - actual_kc, // kc - }; - - // Delete any existing notification since we may be - // replacing it. The algorithm should ensure that there are - // no existing waiters on this notification. - delete lhs_notifications[blockAId]; - lhs_notifications[blockAId] = - this->m_device.enqueue(&Self::packLhs, arg); - } - - // now start kernels. - const Index m_base_start = m_block_idx * mc; - const bool need_to_pack = m_block_idx == 0; - - for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { - const Index n_start = n_block_idx * nc; - const Index actual_nc = numext::mini(n_start + nc, n) - n_start; - - // first make sure the previous kernels are all done before overwriting rhs. Also wait if - // we're going to start new k. In both cases need_to_pack is true. - if (need_to_pack) { - for (Index i = num_blocks; i < num_threads; ++i) { - Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; - Index future_id = (blockAId * n_blocks + n_block_idx); - wait_until_ready(kernel_notifications[future_id]); - } - } - - packRKArg arg = { - &blockAs, // blockA - blockBs[n_block_idx], // blockB - rhs, // rhs - output, // output - m_base_start, // m - k_start, // k - n_start, // n - mc, // mc - actual_kc, // kc - actual_nc, // nc - num_threads, - numBlockAs, - m, - k_block_idx, - m_block_idx, - n_block_idx, // n_block_idx - m_blocks, // m_blocks - n_blocks, // n_blocks - &kernel_notifications, // kernel notifications - &lhs_notifications, // lhs notifications - need_to_pack, // need_to_pack - }; - - // We asynchronously kick off this function, which ends up - // notifying the appropriate kernel_notifications objects, - // which this thread waits on before exiting. 
- this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); - } - } - } - - // Make sure all the kernels are done. - for (size_t i = 0; i < kernel_notifications.size(); ++i) { - wait_until_ready(kernel_notifications[i]); - delete kernel_notifications[i]; - } - - // No need to wait for lhs notifications since they should have - // already been waited on. Just clean them up. - for (size_t i = 0; i < lhs_notifications.size(); ++i) { - delete lhs_notifications[i]; - } - - // deallocate all of the memory for both A and B's - for (size_t i = 0; i < blockAs.size(); i++) { - this->m_device.deallocate(blockAs[i]); - } - for (size_t i = 0; i < blockBs.size(); i++) { - this->m_device.deallocate(blockBs[i]); - } - -#undef CEIL_DIV - } - - /* - * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing - * the LHS block, check that all of the kernels that worked on the same - * mt_block_idx in the previous m_block are done. - */ - template - static void packLhs(const packLArg arg) { - // perform actual packing - LhsPacker pack_lhs; - pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); - } - - /* - * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that - * all kernels in the previous block are done. - * Then for each LHS future, we wait on the future and then call GEBP - * on the area packed by the future (which starts at - * blockA + future_idx * mt * kc) on the LHS and with the full packed - * RHS block. - * The output of this GEBP is written to output(m + i * mt, n). - */ - template - static void packRhsAndKernel(packRKArg arg) { - if (arg.need_to_pack) { - RhsPacker pack_rhs; - pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); - } - - GebpKernel gebp; - for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { - const Index m_base_start = arg.m + arg.mc*mt_block_idx; - if (m_base_start < arg.max_m) { - Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; - wait_until_ready((*arg.lhs_notifications)[blockAId]); - const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; - gebp(arg.output.getSubMapper(m_base_start, arg.n), - (*arg.blockAs)[blockAId], arg.blockB, - actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); - - // Notify that the kernel is done. 
- const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; - (*arg.kernel_notifications)[set_idx]->Notify(); - } - } - } -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, bool shard_by_col, bool prepacked) const { const int packed_size = std::min(PacketType::size, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 1181c2753..53640c6aa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -150,13 +150,6 @@ class TensorExecutor { if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); -#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) - device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [&evaluator](Index first, Index last) { - EvalRange::run(&evaluator, first, last); - }); -#else size_t num_threads = device.numThreads(); if (num_threads > 1) { num_threads = TensorCostModel::numThreads( @@ -182,7 +175,6 @@ class TensorExecutor { } barrier.Wait(); } -#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) } evaluator.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 1264a0270..ecd49f382 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -14,15 +14,15 @@ namespace Eigen { template -class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { +class ThreadPoolTempl : public Eigen::ThreadPoolInterface { public: typedef typename Environment::Task Task; typedef RunQueue Queue; - NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : NonBlockingThreadPoolTempl(num_threads, true, env) {} + ThreadPoolTempl(int num_threads, Environment env = Environment()) + : ThreadPoolTempl(num_threads, true, env) {} - NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, + ThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) : env_(env), num_threads_(num_threads), @@ -66,7 +66,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } } - ~NonBlockingThreadPoolTempl() { + ~ThreadPoolTempl() { done_ = true; // Now if all threads block without work, they will start exiting. @@ -136,7 +136,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { int CurrentThreadId() const final { const PerThread* pt = - const_cast(this)->GetPerThread(); + const_cast(this)->GetPerThread(); if (pt->pool == this) { return pt->thread_id; } else { @@ -149,7 +149,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { struct PerThread { constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { } - NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads. + ThreadPoolTempl* pool; // Parent pool, or null for normal threads. uint64_t rand; // Random generator state. int thread_id; // Worker thread index in pool. 
}; @@ -337,7 +337,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } }; -typedef NonBlockingThreadPoolTempl NonBlockingThreadPool; +typedef ThreadPoolTempl ThreadPool; } // namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h deleted file mode 100644 index 335728665..000000000 --- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h +++ /dev/null @@ -1,162 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H -#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H - -namespace Eigen { - -// The implementation of the ThreadPool type ensures that the Schedule method -// runs the functions it is provided in FIFO order when the scheduling is done -// by a single thread. -// Environment provides a way to create threads and also allows to intercept -// task submission and execution. -template -class SimpleThreadPoolTempl : public ThreadPoolInterface { - public: - // Construct a pool that contains "num_threads" threads. - explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment()) - : env_(env), threads_(num_threads), waiters_(num_threads) { - for (int i = 0; i < num_threads; i++) { - threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); })); - } - } - - // Wait until all scheduled work has finished and then destroy the - // set of threads. - ~SimpleThreadPoolTempl() { - { - // Wait for all work to get done. - std::unique_lock l(mu_); - while (!pending_.empty()) { - empty_.wait(l); - } - exiting_ = true; - - // Wakeup all waiters. - for (auto w : waiters_) { - w->ready = true; - w->task.f = nullptr; - w->cv.notify_one(); - } - } - - // Wait for threads to finish. - for (auto t : threads_) { - delete t; - } - } - - // Schedule fn() for execution in the pool of threads. The functions are - // executed in the order in which they are scheduled. 
- void Schedule(std::function fn) final { - Task t = env_.CreateTask(std::move(fn)); - std::unique_lock l(mu_); - if (waiters_.empty()) { - pending_.push_back(std::move(t)); - } else { - Waiter* w = waiters_.back(); - waiters_.pop_back(); - w->ready = true; - w->task = std::move(t); - w->cv.notify_one(); - } - } - - void Cancel() { -#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION - for (size_t i = 0; i < threads_.size(); i++) { - threads_[i]->OnCancel(); - } -#endif - } - - int NumThreads() const final { - return static_cast(threads_.size()); - } - - int CurrentThreadId() const final { - const PerThread* pt = this->GetPerThread(); - if (pt->pool == this) { - return pt->thread_id; - } else { - return -1; - } - } - - protected: - void WorkerLoop(int thread_id) { - std::unique_lock l(mu_); - PerThread* pt = GetPerThread(); - pt->pool = this; - pt->thread_id = thread_id; - Waiter w; - Task t; - while (!exiting_) { - if (pending_.empty()) { - // Wait for work to be assigned to me - w.ready = false; - waiters_.push_back(&w); - while (!w.ready) { - w.cv.wait(l); - } - t = w.task; - w.task.f = nullptr; - } else { - // Pick up pending work - t = std::move(pending_.front()); - pending_.pop_front(); - if (pending_.empty()) { - empty_.notify_all(); - } - } - if (t.f) { - mu_.unlock(); - env_.ExecuteTask(t); - t.f = nullptr; - mu_.lock(); - } - } - } - - private: - typedef typename Environment::Task Task; - typedef typename Environment::EnvThread Thread; - - struct Waiter { - std::condition_variable cv; - Task task; - bool ready; - }; - - struct PerThread { - constexpr PerThread() : pool(NULL), thread_id(-1) { } - SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads. - int thread_id; // Worker thread index in pool. - }; - - Environment env_; - std::mutex mu_; - MaxSizeVector threads_; // All threads - MaxSizeVector waiters_; // Stack of waiting threads. 
- std::deque pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() - bool exiting_ = false; - - PerThread* GetPerThread() const { - EIGEN_THREAD_LOCAL PerThread per_thread; - return &per_thread; - } -}; - -typedef SimpleThreadPoolTempl SimpleThreadPool; - -} // namespace Eigen - -#endif // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H From 43206ac4de09e348aaf0c8e7134f59872eaf55f4 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 12 Jul 2018 14:52:23 -0700 Subject: [PATCH 252/680] Call OutputKernel in evalGemv --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 85126a127..0fbffa34c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -549,6 +549,11 @@ struct TensorContractionEvaluatorBase internal::general_matrix_vector_product::run( rows, cols, lhs, rhs, buffer, resIncr, alpha); + + typedef internal::blas_data_mapper OutputMapper; + m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, + static_cast(0), static_cast(0), rows, + static_cast(1)); } template From 2b2cd85694c7d340eb8b3d3ba944d33360666381 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 17 Jul 2018 11:11:33 +0200 Subject: [PATCH 253/680] bug #1573: add noexcept move constructor and move assignment operator to Quaternion --- Eigen/src/Geometry/Quaternion.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index c3fd8c3e0..6fad6564a 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -105,6 +105,22 @@ class QuaternionBase : public RotationBase EIGEN_DEVICE_FUNC Derived& operator=(const AngleAxisType& aa); template EIGEN_DEVICE_FUNC Derived& operator=(const MatrixBase& m); +#if EIGEN_HAS_RVALUE_REFERENCES + // Because we have a user-defined copy assignment operator, we don't get an implicit move constructor or assignment operator. + /** Default move constructor */ + EIGEN_DEVICE_FUNC inline QuaternionBase(QuaternionBase&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) = default; + + /** Default move assignment operator */ + EIGEN_DEVICE_FUNC QuaternionBase& operator=(QuaternionBase&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) = default; + + // And now because we declared a constructor, we don't get a default constructor or copy constructor. Say we want them. + /** Default constructor */ + EIGEN_DEVICE_FUNC QuaternionBase() = default; + + /** Copy constructor */ + EIGEN_DEVICE_FUNC QuaternionBase(const QuaternionBase& other) = default; +#endif + /** \returns a quaternion representing an identity rotation * \sa MatrixBase::Identity() */ @@ -276,6 +292,19 @@ public: EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion& other) { m_coeffs = other.coeffs().template cast(); } +#if EIGEN_HAS_RVALUE_REFERENCES + // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator. 
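
For context on the members declared just below: a user-declared copy assignment operator suppresses the implicit move operations, so until this patch a std::vector<Quaternionf> had to copy every element on reallocation. A small sketch of what the restored, conditionally noexcept moves buy; grow() is a hypothetical helper, not part of the patch:

#include <Eigen/Geometry>
#include <type_traits>
#include <vector>

// Should hold once this patch is in: float's move is nothrow, so the
// quaternion's defaulted move is too, and vector reallocation may move
// elements instead of copying them.
static_assert(std::is_nothrow_move_constructible<Eigen::Quaternionf>::value,
              "Quaternionf should be nothrow-movable");

void grow(std::vector<Eigen::Quaternionf>& qs) {
  qs.push_back(Eigen::Quaternionf::Identity());  // may reallocate; now moves
}
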
+  /** Default move constructor */
+  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value) = default;
+
+  /** Default move assignment operator */
+  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value) = default;
+
+  // And now because we declared a constructor, we don't get an implicit copy constructor. Say we want one.
+  /** Default copy constructor */
+  EIGEN_DEVICE_FUNC Quaternion(const Quaternion& other) = default;
+#endif
+
   EIGEN_DEVICE_FUNC static Quaternion UnitRandom();

   template<typename Derived1, typename Derived2>

From 37f4bdd97df454f2c85c6047d190f5db21c3a09b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 17 Jul 2018 13:20:49 +0200
Subject: [PATCH 254/680] Fix VERIFY_EVALUATION_COUNT(EXPR,N) with a complex
 expression as N

Without the added parentheses, an argument such as `size>10?4:1` parses
against `!=` first and changes the meaning of the check.
---
 test/main.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/main.h b/test/main.h
index 95bbc9eb0..51399f7ad 100644
--- a/test/main.h
+++ b/test/main.h
@@ -122,8 +122,8 @@ inline void on_temporary_creation(long int size) {
 #define VERIFY_EVALUATION_COUNT(XPR,N) {\
     nb_temporaries = 0; \
     XPR; \
-    if(nb_temporaries!=N) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
-    VERIFY( (#XPR) && nb_temporaries==N ); \
+    if(nb_temporaries!=(N)) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\
+    VERIFY( (#XPR) && nb_temporaries==(N) ); \
   }
 #endif

From 82f0ce27261df3b21037d93d4595655b3df754a6 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 17 Jul 2018 14:46:15 +0200
Subject: [PATCH 255/680] Get rid of EIGEN_TEST_FUNC; unit tests must now be
 declared with EIGEN_DECLARE_TEST(mytest) { /* code */ }.

This provides several advantages:
 - more flexibility in designing unit tests
 - unit tests can be glued together to speed up compilation
 - unit tests are compiled with the same predefined macros, which is
   a requirement for zapcc
---
 cmake/EigenTesting.cmake | 4 --
 test/adjoint.cpp | 2 +-
 test/array.cpp | 2 +-
 test/array_for_matrix.cpp | 2 +-
 test/array_of_string.cpp | 2 +-
 test/array_replicate.cpp | 2 +-
 test/array_reverse.cpp | 2 +-
 test/bandmatrix.cpp | 2 +-
 test/basicstuff.cpp | 2 +-
 test/bdcsvd.cpp | 2 +-
 test/bicgstab.cpp | 2 +-
 test/block.cpp | 2 +-
 test/boostmultiprec.cpp | 2 +-
 test/cholesky.cpp | 2 +-
 test/cholmod_support.cpp | 2 +-
 test/commainitializer.cpp | 2 +-
 test/conjugate_gradient.cpp | 2 +-
 test/conservative_resize.cpp | 2 +-
 test/constructor.cpp | 2 +-
 test/corners.cpp | 2 +-
 test/ctorleak.cpp | 2 +-
 test/denseLM.cpp | 2 +-
 test/dense_storage.cpp | 2 +-
 test/determinant.cpp | 2 +-
 test/diagonal.cpp | 2 +-
 test/diagonalmatrices.cpp | 2 +-
 test/dontalign.cpp | 2 +-
 test/dynalloc.cpp | 2 +-
 test/eigen2support.cpp | 2 +-
 test/eigensolver_complex.cpp | 2 +-
 test/eigensolver_generalized_real.cpp | 2 +-
 test/eigensolver_generic.cpp | 2 +-
 test/eigensolver_selfadjoint.cpp | 2 +-
 test/evaluators.cpp | 2 +-
 test/exceptions.cpp | 2 +-
 test/fastmath.cpp | 2 +-
 test/first_aligned.cpp | 2 +-
 test/geo_alignedbox.cpp | 2 +-
 test/geo_eulerangles.cpp | 2 +-
 test/geo_homogeneous.cpp | 2 +-
 test/geo_hyperplane.cpp | 2 +-
 test/geo_orthomethods.cpp | 2 +-
 test/geo_parametrizedline.cpp | 2 +-
 test/geo_quaternion.cpp | 2 +-
 test/geo_transformations.cpp | 2 +-
 test/gpu_basic.cu | 3 +-
 test/half_float.cpp | 2 +-
 test/hessenberg.cpp | 2 +-
 test/householder.cpp | 2 +-
 test/incomplete_cholesky.cpp | 2 +-
 test/indexed_view.cpp | 2 +-
 test/inplace_decomposition.cpp | 2 +-
 test/integer_types.cpp | 2 +-
 test/inverse.cpp | 2 +-
test/is_same_dense.cpp | 2 +- test/jacobi.cpp | 2 +- test/jacobisvd.cpp | 2 +- test/klu_support.cpp | 2 +- test/linearstructure.cpp | 2 +- test/lscg.cpp | 2 +- test/lu.cpp | 2 +- test/main.h | 69 +++++++++++++------ test/mapped_matrix.cpp | 2 +- test/mapstaticmethods.cpp | 2 +- test/mapstride.cpp | 2 +- test/meta.cpp | 2 +- test/metis_support.cpp | 2 +- test/miscmatrices.cpp | 2 +- test/mixingtypes.cpp | 2 +- test/nesting_ops.cpp | 2 +- test/nomalloc.cpp | 2 +- test/nullary.cpp | 2 +- test/num_dimensions.cpp | 2 +- test/numext.cpp | 2 +- test/packetmath.cpp | 2 +- test/pardiso_support.cpp | 2 +- test/pastix_support.cpp | 2 +- test/permutationmatrices.cpp | 2 +- test/prec_inverse_4x4.cpp | 2 +- test/product_extra.cpp | 2 +- test/product_large.cpp | 2 +- test/product_mmtr.cpp | 2 +- test/product_notemporary.cpp | 2 +- test/product_selfadjoint.cpp | 2 +- test/product_small.cpp | 2 +- test/product_symm.cpp | 2 +- test/product_syrk.cpp | 2 +- test/product_trmm.cpp | 2 +- test/product_trmv.cpp | 2 +- test/product_trsolve.cpp | 2 +- test/qr.cpp | 2 +- test/qr_colpivoting.cpp | 2 +- test/qr_fullpivoting.cpp | 2 +- test/qtvector.cpp | 2 +- test/rand.cpp | 2 +- test/real_qz.cpp | 2 +- test/redux.cpp | 2 +- test/ref.cpp | 2 +- test/resize.cpp | 2 +- test/rvalue_types.cpp | 2 +- test/schur_complex.cpp | 2 +- test/schur_real.cpp | 2 +- test/selfadjoint.cpp | 2 +- test/simplicial_cholesky.cpp | 2 +- test/sizeof.cpp | 2 +- test/sizeoverflow.cpp | 2 +- test/smallvectors.cpp | 2 +- test/sparseLM.cpp | 2 +- test/sparse_basic.cpp | 2 +- test/sparse_block.cpp | 2 +- test/sparse_permutations.cpp | 2 +- test/sparse_product.cpp | 2 +- test/sparse_ref.cpp | 2 +- test/sparse_solvers.cpp | 2 +- test/sparse_vector.cpp | 2 +- test/sparselu.cpp | 2 +- test/sparseqr.cpp | 2 +- test/special_numbers.cpp | 2 +- test/spqr_support.cpp | 2 +- test/stable_norm.cpp | 2 +- test/stddeque.cpp | 2 +- test/stddeque_overload.cpp | 2 +- test/stdlist.cpp | 2 +- test/stdlist_overload.cpp | 2 +- test/stdvector.cpp | 2 +- test/stdvector_overload.cpp | 2 +- test/superlu_support.cpp | 2 +- test/swap.cpp | 2 +- test/symbolic_index.cpp | 2 +- test/triangular.cpp | 2 +- test/umeyama.cpp | 2 +- test/umfpack_support.cpp | 2 +- test/unalignedassert.cpp | 2 +- test/unalignedcount.cpp | 2 +- test/upperbidiagonalization.cpp | 2 +- test/vectorization_logic.cpp | 2 +- test/vectorwiseop.cpp | 2 +- test/visitor.cpp | 2 +- test/zerosized.cpp | 2 +- .../LevenbergMarquardt/LevenbergMarquardt.h | 2 +- unsupported/test/BVH.cpp | 2 +- unsupported/test/EulerAngles.cpp | 2 +- unsupported/test/FFTW.cpp | 2 +- unsupported/test/NonLinearOptimization.cpp | 2 +- unsupported/test/NumericalDiff.cpp | 2 +- unsupported/test/alignedvector3.cpp | 2 +- unsupported/test/autodiff.cpp | 2 +- unsupported/test/autodiff_scalar.cpp | 2 +- unsupported/test/cxx11_eventcount.cpp | 2 +- unsupported/test/cxx11_meta.cpp | 2 +- .../test/cxx11_non_blocking_thread_pool.cpp | 2 +- unsupported/test/cxx11_runqueue.cpp | 2 +- unsupported/test/cxx11_tensor_argmax.cpp | 2 +- unsupported/test/cxx11_tensor_argmax_gpu.cu | 2 +- unsupported/test/cxx11_tensor_argmax_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_assign.cpp | 2 +- .../test/cxx11_tensor_broadcast_sycl.cpp | 4 +- .../test/cxx11_tensor_broadcasting.cpp | 2 +- .../test/cxx11_tensor_builtins_sycl.cpp | 4 +- .../test/cxx11_tensor_cast_float16_gpu.cu | 2 +- unsupported/test/cxx11_tensor_casts.cpp | 2 +- unsupported/test/cxx11_tensor_chipping.cpp | 2 +- .../test/cxx11_tensor_chipping_sycl.cpp | 4 +- 
unsupported/test/cxx11_tensor_comparisons.cpp | 2 +- .../cxx11_tensor_complex_cwise_ops_gpu.cu | 2 +- unsupported/test/cxx11_tensor_complex_gpu.cu | 2 +- .../test/cxx11_tensor_concatenation.cpp | 2 +- .../test/cxx11_tensor_concatenation_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_const.cpp | 2 +- unsupported/test/cxx11_tensor_contract_gpu.cu | 2 +- .../test/cxx11_tensor_contract_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_contraction.cpp | 2 +- unsupported/test/cxx11_tensor_convolution.cpp | 2 +- .../test/cxx11_tensor_convolution_sycl.cpp | 4 +- .../test/cxx11_tensor_custom_index.cpp | 2 +- unsupported/test/cxx11_tensor_custom_op.cpp | 2 +- .../test/cxx11_tensor_custom_op_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_device.cu | 2 +- unsupported/test/cxx11_tensor_device_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_dimension.cpp | 2 +- unsupported/test/cxx11_tensor_empty.cpp | 2 +- unsupported/test/cxx11_tensor_expr.cpp | 2 +- unsupported/test/cxx11_tensor_fft.cpp | 2 +- unsupported/test/cxx11_tensor_fixed_size.cpp | 2 +- unsupported/test/cxx11_tensor_forced_eval.cpp | 2 +- .../test/cxx11_tensor_forced_eval_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_generator.cpp | 2 +- .../test/cxx11_tensor_generator_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_gpu.cu | 2 +- unsupported/test/cxx11_tensor_ifft.cpp | 2 +- unsupported/test/cxx11_tensor_image_patch.cpp | 2 +- .../test/cxx11_tensor_image_patch_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_index_list.cpp | 2 +- unsupported/test/cxx11_tensor_inflation.cpp | 2 +- .../test/cxx11_tensor_inflation_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_intdiv.cpp | 2 +- unsupported/test/cxx11_tensor_io.cpp | 2 +- unsupported/test/cxx11_tensor_layout_swap.cpp | 2 +- .../test/cxx11_tensor_layout_swap_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_lvalue.cpp | 2 +- unsupported/test/cxx11_tensor_map.cpp | 2 +- unsupported/test/cxx11_tensor_math.cpp | 2 +- .../test/cxx11_tensor_mixed_indices.cpp | 2 +- unsupported/test/cxx11_tensor_morphing.cpp | 2 +- .../test/cxx11_tensor_morphing_sycl.cpp | 4 +- .../test/cxx11_tensor_notification.cpp | 2 +- unsupported/test/cxx11_tensor_of_complex.cpp | 2 +- .../test/cxx11_tensor_of_const_values.cpp | 2 +- .../test/cxx11_tensor_of_float16_gpu.cu | 2 +- unsupported/test/cxx11_tensor_of_strings.cpp | 2 +- unsupported/test/cxx11_tensor_padding.cpp | 2 +- .../test/cxx11_tensor_padding_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_patch.cpp | 2 +- unsupported/test/cxx11_tensor_patch_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_random.cpp | 2 +- unsupported/test/cxx11_tensor_random_gpu.cu | 2 +- unsupported/test/cxx11_tensor_reduction.cpp | 2 +- .../test/cxx11_tensor_reduction_gpu.cu | 2 +- .../test/cxx11_tensor_reduction_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_ref.cpp | 2 +- unsupported/test/cxx11_tensor_reverse.cpp | 2 +- .../test/cxx11_tensor_reverse_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_roundings.cpp | 2 +- unsupported/test/cxx11_tensor_scan.cpp | 2 +- unsupported/test/cxx11_tensor_scan_gpu.cu | 2 +- unsupported/test/cxx11_tensor_shuffling.cpp | 2 +- .../test/cxx11_tensor_shuffling_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_simple.cpp | 2 +- unsupported/test/cxx11_tensor_striding.cpp | 2 +- .../test/cxx11_tensor_striding_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_sugar.cpp | 2 +- unsupported/test/cxx11_tensor_sycl.cpp | 4 +- unsupported/test/cxx11_tensor_symmetry.cpp | 2 +- unsupported/test/cxx11_tensor_thread_pool.cpp | 2 +- unsupported/test/cxx11_tensor_trace.cpp | 2 +- 
unsupported/test/cxx11_tensor_uint128.cpp | 2 +- .../test/cxx11_tensor_volume_patch.cpp | 2 +- .../test/cxx11_tensor_volume_patch_sycl.cpp | 4 +- unsupported/test/dgmres.cpp | 2 +- unsupported/test/forward_adolc.cpp | 2 +- unsupported/test/gmres.cpp | 2 +- unsupported/test/kronecker_product.cpp | 4 +- unsupported/test/levenberg_marquardt.cpp | 2 +- unsupported/test/matrix_exponential.cpp | 2 +- unsupported/test/matrix_function.cpp | 2 +- unsupported/test/matrix_power.cpp | 2 +- unsupported/test/matrix_square_root.cpp | 2 +- unsupported/test/minres.cpp | 2 +- unsupported/test/mpreal_support.cpp | 2 +- unsupported/test/openglsupport.cpp | 2 +- unsupported/test/polynomialsolver.cpp | 2 +- unsupported/test/polynomialutils.cpp | 2 +- unsupported/test/sparse_extra.cpp | 2 +- unsupported/test/special_functions.cpp | 2 +- unsupported/test/splines.cpp | 2 +- 255 files changed, 325 insertions(+), 303 deletions(-) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index d60873554..35deed509 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -68,8 +68,6 @@ macro(ei_add_test_internal testname testname_with_suffix) ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}") - if(MSVC) ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") endif() @@ -170,8 +168,6 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix) ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}") - if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS) ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") endif() diff --git a/test/adjoint.cpp b/test/adjoint.cpp index 37032d220..ca44466b2 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -145,7 +145,7 @@ template void adjoint(const MatrixType& m) VERIFY_IS_APPROX(rv1.template cast().dot(v1), rv1.dot(v1)); } -void test_adjoint() +EIGEN_DECLARE_TEST(adjoint) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( adjoint(Matrix()) ); diff --git a/test/array.cpp b/test/array.cpp index 7e228ed54..c01653668 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -449,7 +449,7 @@ template void min_max(const ArrayType& m) } -void test_array() +EIGEN_DECLARE_TEST(array) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( array(Array()) ); diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp index 3dc5dc487..6b03abb10 100644 --- a/test/array_for_matrix.cpp +++ b/test/array_for_matrix.cpp @@ -256,7 +256,7 @@ void regrrssion_bug_1410() VERIFY((internal::traits >::Flags&LvalueBit)==LvalueBit); } -void test_array_for_matrix() +EIGEN_DECLARE_TEST(array_for_matrix) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( array_for_matrix(Matrix()) ); diff --git a/test/array_of_string.cpp b/test/array_of_string.cpp index e23b7c59e..23e51529b 100644 --- a/test/array_of_string.cpp +++ b/test/array_of_string.cpp @@ -9,7 +9,7 @@ #include "main.h" -void test_array_of_string() +EIGEN_DECLARE_TEST(array_of_string) { typedef Array ArrayXs; ArrayXs a1(3), a2(3), a3(3), a3ref(3); diff --git a/test/array_replicate.cpp b/test/array_replicate.cpp index 0dad5bace..057c3c77b 100644 --- a/test/array_replicate.cpp +++ b/test/array_replicate.cpp @@ -68,7 +68,7 @@ template void replicate(const MatrixType& m) VERIFY_IS_APPROX(vx1, v1.colwise().replicate(f2)); } -void test_array_replicate() 
+EIGEN_DECLARE_TEST(array_replicate) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( replicate(Matrix()) ); diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp index 9d5b9a66d..317b9a6e2 100644 --- a/test/array_reverse.cpp +++ b/test/array_reverse.cpp @@ -123,7 +123,7 @@ template void reverse(const MatrixType& m) VERIFY_IS_APPROX(x, m1(r, cols - 1 - c)); } -void test_array_reverse() +EIGEN_DECLARE_TEST(array_reverse) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( reverse(Matrix()) ); diff --git a/test/bandmatrix.cpp b/test/bandmatrix.cpp index f8c38f7c3..66a1b0db4 100644 --- a/test/bandmatrix.cpp +++ b/test/bandmatrix.cpp @@ -59,7 +59,7 @@ template void bandmatrix(const MatrixType& _m) using Eigen::internal::BandMatrix; -void test_bandmatrix() +EIGEN_DECLARE_TEST(bandmatrix) { for(int i = 0; i < 10*g_repeat ; i++) { Index rows = internal::random(1,10); diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp index 0fbeb42eb..5ad4090cd 100644 --- a/test/basicstuff.cpp +++ b/test/basicstuff.cpp @@ -268,7 +268,7 @@ void fixedSizeMatrixConstruction() } } -void test_basicstuff() +EIGEN_DECLARE_TEST(basicstuff) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( basicStuff(Matrix()) ); diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index 109218766..3065ff015 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -62,7 +62,7 @@ void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computa if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); } -void test_bdcsvd() +EIGEN_DECLARE_TEST(bdcsvd) { CALL_SUBTEST_3(( svd_verify_assert >(Matrix3f()) )); CALL_SUBTEST_4(( svd_verify_assert >(Matrix4d()) )); diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp index 4cc0dd31c..89d6a45ef 100644 --- a/test/bicgstab.cpp +++ b/test/bicgstab.cpp @@ -26,7 +26,7 @@ template void test_bicgstab_T() //CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_ssor) ); } -void test_bicgstab() +EIGEN_DECLARE_TEST(bicgstab) { CALL_SUBTEST_1((test_bicgstab_T()) ); CALL_SUBTEST_2((test_bicgstab_T, int>())); diff --git a/test/block.cpp b/test/block.cpp index a45dd501d..ca13539a9 100644 --- a/test/block.cpp +++ b/test/block.cpp @@ -276,7 +276,7 @@ void data_and_stride(const MatrixType& m) compare_using_data_and_stride(m1.col(c1).transpose()); } -void test_block() +EIGEN_DECLARE_TEST(block) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( block(Matrix()) ); diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp index bafbd3b96..1588eb86d 100644 --- a/test/boostmultiprec.cpp +++ b/test/boostmultiprec.cpp @@ -145,7 +145,7 @@ namespace Eigen { } -void test_boostmultiprec() +EIGEN_DECLARE_TEST(boostmultiprec) { typedef Matrix Mat; typedef Matrix,Dynamic,Dynamic> MatC; diff --git a/test/cholesky.cpp b/test/cholesky.cpp index 62c391de4..b871351e0 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -488,7 +488,7 @@ template void cholesky_verify_assert() VERIFY_RAISES_ASSERT(ldlt.solveInPlace(&tmp)) } -void test_cholesky() +EIGEN_DECLARE_TEST(cholesky) { int s = 0; for(int i = 0; i < g_repeat; i++) { diff --git a/test/cholmod_support.cpp b/test/cholmod_support.cpp index 931207334..89b9cf41e 100644 --- a/test/cholmod_support.cpp +++ b/test/cholmod_support.cpp @@ -55,7 +55,7 @@ template void test_cholmod_T() test_cholmod_ST >(); } -void test_cholmod_support() +EIGEN_DECLARE_TEST(cholmod_support) { CALL_SUBTEST_11( (test_cholmod_T()) ); CALL_SUBTEST_12( (test_cholmod_T()) ); diff --git a/test/commainitializer.cpp 
b/test/commainitializer.cpp index 9844adbd2..3cb94da62 100644 --- a/test/commainitializer.cpp +++ b/test/commainitializer.cpp @@ -65,7 +65,7 @@ struct test_block_recursion<-1> static void run() { } }; -void test_commainitializer() +EIGEN_DECLARE_TEST(commainitializer) { Matrix3d m3; Matrix4d m4; diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 9622fd86d..47a4ca707 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -26,7 +26,7 @@ template void test_conjugate_gradient_T() CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I) ); } -void test_conjugate_gradient() +EIGEN_DECLARE_TEST(conjugate_gradient) { CALL_SUBTEST_1(( test_conjugate_gradient_T() )); CALL_SUBTEST_2(( test_conjugate_gradient_T, int>() )); diff --git a/test/conservative_resize.cpp b/test/conservative_resize.cpp index 21a1db4ac..d3d0a5a31 100644 --- a/test/conservative_resize.cpp +++ b/test/conservative_resize.cpp @@ -109,7 +109,7 @@ void run_vector_tests() } } -void test_conservative_resize() +EIGEN_DECLARE_TEST(conservative_resize) { for(int i=0; i void ctor_init1(const MatrixType& m) } -void test_constructor() +EIGEN_DECLARE_TEST(constructor) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( ctor_init1(Matrix()) ); diff --git a/test/corners.cpp b/test/corners.cpp index 32edadb25..73342a8dd 100644 --- a/test/corners.cpp +++ b/test/corners.cpp @@ -101,7 +101,7 @@ template void c VERIFY_IS_EQUAL((const_matrix.template rightCols()), (const_matrix.template block(0,cols-c))); } -void test_corners() +EIGEN_DECLARE_TEST(corners) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( corners(Matrix()) ); diff --git a/test/ctorleak.cpp b/test/ctorleak.cpp index c158f5e4e..7202e90dd 100644 --- a/test/ctorleak.cpp +++ b/test/ctorleak.cpp @@ -33,7 +33,7 @@ Index Foo::object_limit = 0; #undef EIGEN_TEST_MAX_SIZE #define EIGEN_TEST_MAX_SIZE 3 -void test_ctorleak() +EIGEN_DECLARE_TEST(ctorleak) { typedef Matrix MatrixX; typedef Matrix VectorX; diff --git a/test/denseLM.cpp b/test/denseLM.cpp index 0aa736ea3..afb8004b1 100644 --- a/test/denseLM.cpp +++ b/test/denseLM.cpp @@ -182,7 +182,7 @@ void test_denseLM_T() } -void test_denseLM() +EIGEN_DECLARE_TEST(denseLM) { CALL_SUBTEST_2(test_denseLM_T()); diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp index e63712b1a..1150ec52b 100644 --- a/test/dense_storage.cpp +++ b/test/dense_storage.cpp @@ -52,7 +52,7 @@ void dense_storage_assignment() VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]); } -void test_dense_storage() +EIGEN_DECLARE_TEST(dense_storage) { dense_storage_copy(); dense_storage_copy(); diff --git a/test/determinant.cpp b/test/determinant.cpp index b8c9babb3..7dd33c373 100644 --- a/test/determinant.cpp +++ b/test/determinant.cpp @@ -50,7 +50,7 @@ template void determinant(const MatrixType& m) VERIFY_IS_APPROX(m2.block(0,0,0,0).determinant(), Scalar(1)); } -void test_determinant() +EIGEN_DECLARE_TEST(determinant) { for(int i = 0; i < g_repeat; i++) { int s = 0; diff --git a/test/diagonal.cpp b/test/diagonal.cpp index 8ed9b4682..4e8c4b3c9 100644 --- a/test/diagonal.cpp +++ b/test/diagonal.cpp @@ -88,7 +88,7 @@ template void diagonal_assert(const MatrixType& m) { VERIFY_RAISES_ASSERT( m1.diagonal(-(rows+1)) ); } -void test_diagonal() +EIGEN_DECLARE_TEST(diagonal) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( diagonal(Matrix()) ); diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index c55733df8..ba58ca8d1 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ 
-144,7 +144,7 @@ void bug987() VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() ); } -void test_diagonalmatrices() +EIGEN_DECLARE_TEST(diagonalmatrices) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( diagonalmatrices(Matrix()) ); diff --git a/test/dontalign.cpp b/test/dontalign.cpp index ac00112ed..2e4102b86 100644 --- a/test/dontalign.cpp +++ b/test/dontalign.cpp @@ -44,7 +44,7 @@ void dontalign(const MatrixType& m) internal::aligned_delete(array, rows); } -void test_dontalign() +EIGEN_DECLARE_TEST(dontalign) { #if defined EIGEN_TEST_PART_1 || defined EIGEN_TEST_PART_5 dontalign(Matrix3d()); diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp index f1cc70bee..ceecd76e3 100644 --- a/test/dynalloc.cpp +++ b/test/dynalloc.cpp @@ -119,7 +119,7 @@ template void check_custom_new_delete() #endif } -void test_dynalloc() +EIGEN_DECLARE_TEST(dynalloc) { // low level dynamic memory allocation CALL_SUBTEST(check_handmade_aligned_malloc()); diff --git a/test/eigen2support.cpp b/test/eigen2support.cpp index ac6931a0e..49d7328e9 100644 --- a/test/eigen2support.cpp +++ b/test/eigen2support.cpp @@ -52,7 +52,7 @@ template void eigen2support(const MatrixType& m) m1.minor(0,0); } -void test_eigen2support() +EIGEN_DECLARE_TEST(eigen2support) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( eigen2support(Matrix()) ); diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp index 8f279dc00..c5373f420 100644 --- a/test/eigensolver_complex.cpp +++ b/test/eigensolver_complex.cpp @@ -152,7 +152,7 @@ template void eigensolver_verify_assert(const MatrixType& m VERIFY_RAISES_ASSERT(eig.eigenvectors()); } -void test_eigensolver_complex() +EIGEN_DECLARE_TEST(eigensolver_complex) { int s = 0; for(int i = 0; i < g_repeat; i++) { diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp index 9dd44c89d..95ed431db 100644 --- a/test/eigensolver_generalized_real.cpp +++ b/test/eigensolver_generalized_real.cpp @@ -85,7 +85,7 @@ template void generalized_eigensolver_real(const MatrixType } } -void test_eigensolver_generalized_real() +EIGEN_DECLARE_TEST(eigensolver_generalized_real) { for(int i = 0; i < g_repeat; i++) { int s = 0; diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index 07bf65e03..b41186819 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -100,7 +100,7 @@ template void eigensolver_verify_assert(const MatrixType& m VERIFY_RAISES_ASSERT(eig.pseudoEigenvectors()); } -void test_eigensolver_generic() +EIGEN_DECLARE_TEST(eigensolver_generic) { int s = 0; for(int i = 0; i < g_repeat; i++) { diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index 0e39b5364..65b80c3fb 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp @@ -230,7 +230,7 @@ void bug_1204() SelfAdjointEigenSolver > eig(A); } -void test_eigensolver_selfadjoint() +EIGEN_DECLARE_TEST(eigensolver_selfadjoint) { int s = 0; for(int i = 0; i < g_repeat; i++) { diff --git a/test/evaluators.cpp b/test/evaluators.cpp index aed5a05a7..f4fdaf053 100644 --- a/test/evaluators.cpp +++ b/test/evaluators.cpp @@ -101,7 +101,7 @@ using namespace std; #define VERIFY_IS_APPROX_EVALUATOR(DEST,EXPR) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (EXPR).eval()); #define VERIFY_IS_APPROX_EVALUATOR2(DEST,EXPR,REF) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (REF).eval()); -void test_evaluators() +EIGEN_DECLARE_TEST(evaluators) { // Testing 
Matrix evaluator and Transpose Vector2d v = Vector2d::Random(); diff --git a/test/exceptions.cpp b/test/exceptions.cpp index b83fb82ba..182082e39 100644 --- a/test/exceptions.cpp +++ b/test/exceptions.cpp @@ -107,7 +107,7 @@ void memoryleak() VERIFY(ScalarWithExceptions::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP)); \ } -void test_exceptions() +EIGEN_DECLARE_TEST(exceptions) { CALL_SUBTEST( memoryleak() ); } diff --git a/test/fastmath.cpp b/test/fastmath.cpp index cc5db0746..c30f0a846 100644 --- a/test/fastmath.cpp +++ b/test/fastmath.cpp @@ -88,7 +88,7 @@ void check_inf_nan(bool dryrun) { } } -void test_fastmath() { +EIGEN_DECLARE_TEST(fastmath) { std::cout << "*** float *** \n\n"; check_inf_nan(true); std::cout << "*** double ***\n\n"; check_inf_nan(true); std::cout << "*** long double *** \n\n"; check_inf_nan(true); diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp index ae2d4bc42..ed9945077 100644 --- a/test/first_aligned.cpp +++ b/test/first_aligned.cpp @@ -26,7 +26,7 @@ void test_none_aligned_helper(Scalar *array, int size) struct some_non_vectorizable_type { float x; }; -void test_first_aligned() +EIGEN_DECLARE_TEST(first_aligned) { EIGEN_ALIGN16 float array_float[100]; test_first_aligned_helper(array_float, 50); diff --git a/test/geo_alignedbox.cpp b/test/geo_alignedbox.cpp index 87bf1140f..c6c051ce4 100644 --- a/test/geo_alignedbox.cpp +++ b/test/geo_alignedbox.cpp @@ -169,7 +169,7 @@ void specificTest2() } -void test_geo_alignedbox() +EIGEN_DECLARE_TEST(geo_alignedbox) { for(int i = 0; i < g_repeat; i++) { diff --git a/test/geo_eulerangles.cpp b/test/geo_eulerangles.cpp index 932ebe773..693c627a9 100644 --- a/test/geo_eulerangles.cpp +++ b/test/geo_eulerangles.cpp @@ -103,7 +103,7 @@ template void eulerangles() check_all_var(ea); } -void test_geo_eulerangles() +EIGEN_DECLARE_TEST(geo_eulerangles) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( eulerangles() ); diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp index 2187c7bf9..9aebe6226 100644 --- a/test/geo_homogeneous.cpp +++ b/test/geo_homogeneous.cpp @@ -115,7 +115,7 @@ template void homogeneous(void) VERIFY_IS_APPROX( (t2.template triangularView() * v0.homogeneous()).eval(), (t2.template triangularView()*hv0) ); } -void test_geo_homogeneous() +EIGEN_DECLARE_TEST(geo_homogeneous) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(( homogeneous() )); diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp index b3a48c585..a26709301 100644 --- a/test/geo_hyperplane.cpp +++ b/test/geo_hyperplane.cpp @@ -180,7 +180,7 @@ template void hyperplane_alignment() } -void test_geo_hyperplane() +EIGEN_DECLARE_TEST(geo_hyperplane) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( hyperplane(Hyperplane()) ); diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp index e178df257..b7b660740 100644 --- a/test/geo_orthomethods.cpp +++ b/test/geo_orthomethods.cpp @@ -115,7 +115,7 @@ template void orthomethods(int size=Size) VERIFY_IS_APPROX(mcrossN3.row(i), matN3.row(i).cross(vec3)); } -void test_geo_orthomethods() +EIGEN_DECLARE_TEST(geo_orthomethods) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( orthomethods_3() ); diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp index 14cf4fd48..7135c8fa5 100644 --- a/test/geo_parametrizedline.cpp +++ b/test/geo_parametrizedline.cpp @@ -117,7 +117,7 @@ template void parametrizedline_alignment() #endif } -void test_geo_parametrizedline() +EIGEN_DECLARE_TEST(geo_parametrizedline) { for(int i = 
0; i < g_repeat; i++) { CALL_SUBTEST_1( parametrizedline(ParametrizedLine()) ); diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp index 5854d39c5..3541b8b25 100644 --- a/test/geo_quaternion.cpp +++ b/test/geo_quaternion.cpp @@ -285,7 +285,7 @@ template void check_const_correctness(const PlainObjec VERIFY( !(Map::Flags & LvalueBit) ); } -void test_geo_quaternion() +EIGEN_DECLARE_TEST(geo_quaternion) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(( quaternion() )); diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index 278e527c2..bf920696b 100755 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -612,7 +612,7 @@ template void transform_products() VERIFY_IS_APPROX((ac*p).matrix(), a_m*p_m); } -void test_geo_transformations() +EIGEN_DECLARE_TEST(geo_transformations) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(( transformations() )); diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu index 285b87910..e8069f185 100644 --- a/test/gpu_basic.cu +++ b/test/gpu_basic.cu @@ -15,7 +15,6 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC gpu_basic #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #include "main.h" @@ -162,7 +161,7 @@ struct matrix_inverse { } }; -void test_gpu_basic() +EIGEN_DECLARE_TEST(gpu_basic) { ei_test_init_gpu(); diff --git a/test/half_float.cpp b/test/half_float.cpp index 5a881680a..2a7f9b497 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -272,7 +272,7 @@ void test_product() VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast()); } -void test_half_float() +EIGEN_DECLARE_TEST(half_float) { CALL_SUBTEST(test_numtraits()); for(int i = 0; i < g_repeat; i++) { diff --git a/test/hessenberg.cpp b/test/hessenberg.cpp index 96bc19e2e..0e1b0098d 100644 --- a/test/hessenberg.cpp +++ b/test/hessenberg.cpp @@ -49,7 +49,7 @@ template void hessenberg(int size = Size) // TODO: Add tests for packedMatrix() and householderCoefficients() } -void test_hessenberg() +EIGEN_DECLARE_TEST(hessenberg) { CALL_SUBTEST_1(( hessenberg,1>() )); CALL_SUBTEST_2(( hessenberg,2>() )); diff --git a/test/householder.cpp b/test/householder.cpp index ccda46811..cad8138a2 100644 --- a/test/householder.cpp +++ b/test/householder.cpp @@ -133,7 +133,7 @@ template void householder(const MatrixType& m) VERIFY_IS_APPROX(m3 * m5, m1); // test evaluating rhseq to a dense matrix, then applying } -void test_householder() +EIGEN_DECLARE_TEST(householder) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( householder(Matrix()) ); diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 59ffe9259..52235c2c2 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -29,7 +29,7 @@ template void test_incomplete_cholesky_T() CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) ); } -void test_incomplete_cholesky() +EIGEN_DECLARE_TEST(incomplete_cholesky) { CALL_SUBTEST_1(( test_incomplete_cholesky_T() )); CALL_SUBTEST_2(( test_incomplete_cholesky_T, int>() )); diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 551dc55b0..4cc0954f7 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -388,7 +388,7 @@ void check_indexed_view() } -void test_indexed_view() +EIGEN_DECLARE_TEST(indexed_view) { // for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( check_indexed_view() ); diff --git a/test/inplace_decomposition.cpp b/test/inplace_decomposition.cpp index 92d0d91b6..e3aa9957d 100644 --- a/test/inplace_decomposition.cpp +++ 
b/test/inplace_decomposition.cpp @@ -79,7 +79,7 @@ template void inplace(bool square = false, } -void test_inplace_decomposition() +EIGEN_DECLARE_TEST(inplace_decomposition) { EIGEN_UNUSED typedef Matrix Matrix43d; for(int i = 0; i < g_repeat; i++) { diff --git a/test/integer_types.cpp b/test/integer_types.cpp index 36295598f..88573c568 100644 --- a/test/integer_types.cpp +++ b/test/integer_types.cpp @@ -131,7 +131,7 @@ template void integer_type_tests(const MatrixType& m) VERIFY_IS_APPROX((m1 * m2.transpose()) * m1, m1 * (m2.transpose() * m1)); } -void test_integer_types() +EIGEN_DECLARE_TEST(integer_types) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( integer_type_tests(Matrix()) ); diff --git a/test/inverse.cpp b/test/inverse.cpp index be607cc8b..b989c7905 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -92,7 +92,7 @@ template void inverse(const MatrixType& m) } } -void test_inverse() +EIGEN_DECLARE_TEST(inverse) { int s = 0; for(int i = 0; i < g_repeat; i++) { diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp index c4e2fbc92..23dd806eb 100644 --- a/test/is_same_dense.cpp +++ b/test/is_same_dense.cpp @@ -11,7 +11,7 @@ using internal::is_same_dense; -void test_is_same_dense() +EIGEN_DECLARE_TEST(is_same_dense) { typedef Matrix ColMatrixXd; typedef Matrix,Dynamic,Dynamic,ColMajor> ColMatrixXcd; diff --git a/test/jacobi.cpp b/test/jacobi.cpp index 319e4767a..5604797f5 100644 --- a/test/jacobi.cpp +++ b/test/jacobi.cpp @@ -57,7 +57,7 @@ void jacobi(const MatrixType& m = MatrixType()) } } -void test_jacobi() +EIGEN_DECLARE_TEST(jacobi) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(( jacobi() )); diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp index 72007bff5..f9a59e0e7 100644 --- a/test/jacobisvd.cpp +++ b/test/jacobisvd.cpp @@ -69,7 +69,7 @@ void jacobisvd_method() VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m); } -void test_jacobisvd() +EIGEN_DECLARE_TEST(jacobisvd) { CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) )); CALL_SUBTEST_4(( jacobisvd_verify_assert(Matrix4d()) )); diff --git a/test/klu_support.cpp b/test/klu_support.cpp index 138dcc301..f806ad50e 100644 --- a/test/klu_support.cpp +++ b/test/klu_support.cpp @@ -24,7 +24,7 @@ template void test_klu_support_T() //check_sparse_square_determinant(umfpack_rowmajor); } -void test_klu_support() +EIGEN_DECLARE_TEST(klu_support) { CALL_SUBTEST_1(test_klu_support_T()); CALL_SUBTEST_2(test_klu_support_T >()); diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp index b6559b2a0..4137b0251 100644 --- a/test/linearstructure.cpp +++ b/test/linearstructure.cpp @@ -110,7 +110,7 @@ template void real_complex(DenseIndex rows = MatrixType::Ro VERIFY(g_called && "matrix - real not properly optimized"); } -void test_linearstructure() +EIGEN_DECLARE_TEST(linearstructure) { g_called = true; VERIFY(g_called); // avoid `unneeded-internal-declaration` warning. 
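
The renames in these test files are purely mechanical; the machinery behind them is the small self-registration framework added to test/main.h further down (the EigenTest class plus the EIGEN_DECLARE_TEST macro). Stripped of Eigen's helper macros, the pattern is a static object whose constructor appends the test to a global list that main() then walks. A standalone sketch of that pattern, with all names invented here:

#include <cstdio>
#include <string>
#include <vector>

// One registered test: a name plus a function pointer.
struct Test {
  // A function-local static sidesteps initialization-order issues; the real
  // main.h can use a static member because every test and main() share a TU.
  static std::vector<Test*>& all() {
    static std::vector<Test*> tests;
    return tests;
  }
  Test(const char* n, void (*f)()) : name(n), fn(f) { all().push_back(this); }
  std::string name;
  void (*fn)();
};

// Same shape as EIGEN_DECLARE_TEST: declare test_X, register it, then open
// the definition of test_X so the user's braces complete it.
#define DECLARE_TEST(X)                        \
  void test_##X();                             \
  static Test test_handler_##X(#X, &test_##X); \
  void test_##X()

DECLARE_TEST(adjoint) { std::puts("running adjoint"); }

int main() {
  for (std::size_t i = 0; i < Test::all().size(); ++i) {
    std::printf("Test %s\n", Test::all()[i]->name.c_str());
    Test::all()[i]->fn();   // call through the registered pointer
  }
  return 0;
}
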
diff --git a/test/lscg.cpp b/test/lscg.cpp index d49ee00c3..feb2347a8 100644 --- a/test/lscg.cpp +++ b/test/lscg.cpp @@ -30,7 +30,7 @@ template void test_lscg_T() CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_I) ); } -void test_lscg() +EIGEN_DECLARE_TEST(lscg) { CALL_SUBTEST_1(test_lscg_T()); CALL_SUBTEST_2(test_lscg_T >()); diff --git a/test/lu.cpp b/test/lu.cpp index cad703dd0..144496e91 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -242,7 +242,7 @@ template void lu_verify_assert() VERIFY_RAISES_ASSERT(plu.inverse()) } -void test_lu() +EIGEN_DECLARE_TEST(lu) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( lu_non_invertible() ); diff --git a/test/main.h b/test/main.h index 51399f7ad..5d64bc736 100644 --- a/test/main.h +++ b/test/main.h @@ -128,7 +128,6 @@ inline void on_temporary_creation(long int size) { #endif -// the following file is automatically generated by cmake #include "split_test_helper.h" #ifdef NDEBUG @@ -145,10 +144,6 @@ inline void on_temporary_creation(long int size) { #define EIGEN_MAKING_DOCS #endif -#ifndef EIGEN_TEST_FUNC -#error EIGEN_TEST_FUNC must be defined -#endif - #define DEFAULT_REPEAT 10 namespace Eigen @@ -157,17 +152,45 @@ namespace Eigen // level == 0 <=> abort if test fail // level >= 1 <=> warning message to std::cerr if test fail static int g_test_level = 0; - static int g_repeat; - static unsigned int g_seed; - static bool g_has_set_repeat, g_has_set_seed; + static int g_repeat = 1; + static unsigned int g_seed = 0; + static bool g_has_set_repeat = false, g_has_set_seed = false; + + class EigenTest + { + public: + EigenTest() : m_func(0) {} + EigenTest(const char* a_name, void (*func)(void)) + : m_name(a_name), m_func(func) + { + ms_registered_tests.push_back(this); + } + const std::string& name() const { return m_name; } + void operator()() const { m_func(); } + + static const std::vector& all() { return ms_registered_tests; } + protected: + std::string m_name; + void (*m_func)(void); + static std::vector ms_registered_tests; + }; + + std::vector EigenTest::ms_registered_tests; + + // Declare and register a test, e.g.: + // EIGEN_DECLARE_TEST(mytest) { ... } + // will create a function: + // void test_mytest() { ... } + // that will be automatically called. 
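+  // (Registration happens through the static EigenTest object this macro
+  // defines: its constructor runs during static initialization and appends
+  // the test to EigenTest::all(), which main() below then iterates over.)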
+ #define EIGEN_DECLARE_TEST(X) \ + void EIGEN_CAT(test_,X) (); \ + static EigenTest EIGEN_CAT(test_handler_,X) (EIGEN_MAKESTRING(X), & EIGEN_CAT(test_,X)); \ + void EIGEN_CAT(test_,X) () } #define TRACK std::cerr << __FILE__ << " " << __LINE__ << std::endl // #define TRACK while() -#define EI_PP_MAKE_STRING2(S) #S -#define EI_PP_MAKE_STRING(S) EI_PP_MAKE_STRING2(S) - #define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "") #if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) @@ -225,7 +248,7 @@ namespace Eigen } \ else if (Eigen::internal::push_assert) \ { \ - eigen_assert_list.push_back(std::string(EI_PP_MAKE_STRING(__FILE__) " (" EI_PP_MAKE_STRING(__LINE__) ") : " #a) ); \ + eigen_assert_list.push_back(std::string(EIGEN_MAKESTRING(__FILE__) " (" EIGEN_MAKESTRING(__LINE__) ") : " #a) ); \ } #ifdef EIGEN_EXCEPTIONS @@ -338,10 +361,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file, } } -#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a)) +#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a)) -#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b)) -#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b)) +#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a >= b)) +#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a <= b)) #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b, true)) @@ -358,7 +381,7 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define STATIC_CHECK(COND) EIGEN_STATIC_ASSERT( (COND) , EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT ) #define CALL_SUBTEST(FUNC) do { \ - g_test_stack.push_back(EI_PP_MAKE_STRING(FUNC)); \ + g_test_stack.push_back(EIGEN_MAKESTRING(FUNC)); \ FUNC; \ g_test_stack.pop_back(); \ } while (0) @@ -709,9 +732,6 @@ template<> std::string type_name >() { return "comple template<> std::string type_name >() { return "complex"; } template<> std::string type_name >() { return "complex"; } -// forward declaration of the main test function -void EIGEN_CAT(test_,EIGEN_TEST_FUNC)(); - using namespace Eigen; inline void set_repeat_from_string(const char *str) @@ -798,9 +818,16 @@ int main(int argc, char *argv[]) srand(g_seed); std::cout << "Repeating each test " << g_repeat << " times" << std::endl; - Eigen::g_test_stack.push_back(std::string(EI_PP_MAKE_STRING(EIGEN_TEST_FUNC))); + VERIFY(EigenTest::all().size()>0); + + for(std::size_t i=0; i()) ); diff --git a/test/mapstaticmethods.cpp b/test/mapstaticmethods.cpp index 8156ca939..f32c0beec 100644 --- a/test/mapstaticmethods.cpp +++ b/test/mapstaticmethods.cpp @@ -143,7 +143,7 @@ void mapstaticmethods(const PlainObjectType& m) VERIFY(true); // just to avoid 'unused function' warning } -void test_mapstaticmethods() +EIGEN_DECLARE_TEST(mapstaticmethods) { ptr = internal::aligned_new(1000); for(int i = 0; i < 1000; i++) ptr[i] = float(i); diff --git a/test/mapstride.cpp b/test/mapstride.cpp index d785148cf..09196600b 100644 --- a/test/mapstride.cpp +++ b/test/mapstride.cpp @@ -197,7 +197,7 @@ void bug1453() VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>())); } 
-void test_mapstride() +EIGEN_DECLARE_TEST(mapstride) { for(int i = 0; i < g_repeat; i++) { int maxn = 30; diff --git a/test/meta.cpp b/test/meta.cpp index 34c5520a6..a6a67b85c 100644 --- a/test/meta.cpp +++ b/test/meta.cpp @@ -27,7 +27,7 @@ struct MyImpl : public MyInterface { void func() {} }; -void test_meta() +EIGEN_DECLARE_TEST(meta) { VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value)); VERIFY(( internal::is_same::value)); diff --git a/test/metis_support.cpp b/test/metis_support.cpp index d87c56a13..b490dacde 100644 --- a/test/metis_support.cpp +++ b/test/metis_support.cpp @@ -19,7 +19,7 @@ template void test_metis_T() check_sparse_square_solving(sparselu_metis); } -void test_metis_support() +EIGEN_DECLARE_TEST(metis_support) { CALL_SUBTEST_1(test_metis_T()); } diff --git a/test/miscmatrices.cpp b/test/miscmatrices.cpp index f17291c40..e71712f33 100644 --- a/test/miscmatrices.cpp +++ b/test/miscmatrices.cpp @@ -34,7 +34,7 @@ template void miscMatrices(const MatrixType& m) VERIFY_IS_APPROX(square, MatrixType::Identity(rows, rows)); } -void test_miscmatrices() +EIGEN_DECLARE_TEST(miscmatrices) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( miscMatrices(Matrix()) ); diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index b796082cd..38f062f1e 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -286,7 +286,7 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX( rcd.noalias() -= mcd + md*md, - ((md*md).eval().template cast()) ); } -void test_mixingtypes() +EIGEN_DECLARE_TEST(mixingtypes) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(mixingtypes<3>()); diff --git a/test/nesting_ops.cpp b/test/nesting_ops.cpp index a419b0e44..4b5fc21f2 100644 --- a/test/nesting_ops.cpp +++ b/test/nesting_ops.cpp @@ -91,7 +91,7 @@ template void run_nesting_ops_2(const MatrixType& _m) } -void test_nesting_ops() +EIGEN_DECLARE_TEST(nesting_ops) { CALL_SUBTEST_1(run_nesting_ops_1(MatrixXf::Random(25,25))); CALL_SUBTEST_2(run_nesting_ops_1(MatrixXcd::Random(25,25))); diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index d1ca79d68..cb4c073e9 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp @@ -202,7 +202,7 @@ template void test_reference(const MatrixType& m) { } -void test_nomalloc() +EIGEN_DECLARE_TEST(nomalloc) { // create some dynamic objects Eigen::MatrixXd M1 = MatrixXd::Random(3,3); diff --git a/test/nullary.cpp b/test/nullary.cpp index 22ec92352..9e6e6eaa0 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -239,7 +239,7 @@ void testMatrixType(const MatrixType& m) VERIFY_IS_APPROX( A(i,j), s1 ); } -void test_nullary() +EIGEN_DECLARE_TEST(nullary) { CALL_SUBTEST_1( testMatrixType(Matrix2d()) ); CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random(1,300),internal::random(1,300))) ); diff --git a/test/num_dimensions.cpp b/test/num_dimensions.cpp index f5209283d..7ad7ef697 100644 --- a/test/num_dimensions.cpp +++ b/test/num_dimensions.cpp @@ -60,7 +60,7 @@ using TMatrix = Matrix; #endif -void test_num_dimensions() +EIGEN_DECLARE_TEST(num_dimensions) { int n = 10; ArrayXXd A(n,n); diff --git a/test/numext.cpp b/test/numext.cpp index 3de33e2f9..6307f5979 100644 --- a/test/numext.cpp +++ b/test/numext.cpp @@ -33,7 +33,7 @@ void check_abs() { } } -void test_numext() { +EIGEN_DECLARE_TEST(numext) { CALL_SUBTEST( check_abs() ); CALL_SUBTEST( check_abs() ); CALL_SUBTEST( check_abs() ); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 56e017383..58a1c60bf 100644 --- a/test/packetmath.cpp +++ 
b/test/packetmath.cpp @@ -627,7 +627,7 @@ template void packetmath_scatter_gather() } } -void test_packetmath() +EIGEN_DECLARE_TEST(packetmath) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( packetmath() ); diff --git a/test/pardiso_support.cpp b/test/pardiso_support.cpp index 67efad6d8..9c16ded5b 100644 --- a/test/pardiso_support.cpp +++ b/test/pardiso_support.cpp @@ -20,7 +20,7 @@ template void test_pardiso_T() check_sparse_square_solving(pardiso_lu); } -void test_pardiso_support() +EIGEN_DECLARE_TEST(pardiso_support) { CALL_SUBTEST_1(test_pardiso_T()); CALL_SUBTEST_2(test_pardiso_T()); diff --git a/test/pastix_support.cpp b/test/pastix_support.cpp index b62f85739..9b64417c1 100644 --- a/test/pastix_support.cpp +++ b/test/pastix_support.cpp @@ -45,7 +45,7 @@ template void test_pastix_T_LU() check_sparse_square_solving(pastix_lu); } -void test_pastix_support() +EIGEN_DECLARE_TEST(pastix_support) { CALL_SUBTEST_1(test_pastix_T()); CALL_SUBTEST_2(test_pastix_T()); diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp index e885f0e04..71f09be0f 100644 --- a/test/permutationmatrices.cpp +++ b/test/permutationmatrices.cpp @@ -152,7 +152,7 @@ void bug890() VERIFY_IS_APPROX(v1, (P.inverse() * rhs).eval()); } -void test_permutationmatrices() +EIGEN_DECLARE_TEST(permutationmatrices) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( permutationmatrices(Matrix()) ); diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp index eb6ad18c9..072466467 100644 --- a/test/prec_inverse_4x4.cpp +++ b/test/prec_inverse_4x4.cpp @@ -68,7 +68,7 @@ template void inverse_general_4x4(int repeat) } } -void test_prec_inverse_4x4() +EIGEN_DECLARE_TEST(prec_inverse_4x4) { CALL_SUBTEST_1((inverse_permutation_4x4())); CALL_SUBTEST_1(( inverse_general_4x4(200000 * g_repeat) )); diff --git a/test/product_extra.cpp b/test/product_extra.cpp index de2709d8b..bd31df84d 100644 --- a/test/product_extra.cpp +++ b/test/product_extra.cpp @@ -352,7 +352,7 @@ void bug_1308() VERIFY_IS_APPROX(r44.noalias() += Vector4d::Ones() * m44.col(0).transpose(), ones44); } -void test_product_extra() +EIGEN_DECLARE_TEST(product_extra) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( product_extra(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); diff --git a/test/product_large.cpp b/test/product_large.cpp index 14a4f739d..1b6fec738 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -30,7 +30,7 @@ void test_aliasing() x = z; } -void test_product_large() +EIGEN_DECLARE_TEST(product_large) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( product(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); diff --git a/test/product_mmtr.cpp b/test/product_mmtr.cpp index d3e24b012..bb19e6e52 100644 --- a/test/product_mmtr.cpp +++ b/test/product_mmtr.cpp @@ -84,7 +84,7 @@ template void mmtr(int size) VERIFY_IS_APPROX(matc, ref2); } -void test_product_mmtr() +EIGEN_DECLARE_TEST(product_mmtr) { for(int i = 0; i < g_repeat ; i++) { diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index b90165b2f..dffb07608 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -150,7 +150,7 @@ template void product_notemporary(const MatrixType& m) VERIFY_EVALUATION_COUNT( rvres.noalias() = rv1 * (m1 * m2.adjoint()), 1 ); } -void test_product_notemporary() +EIGEN_DECLARE_TEST(product_notemporary) { int s; for(int i = 0; i < g_repeat; i++) { diff --git 
a/test/product_selfadjoint.cpp b/test/product_selfadjoint.cpp
index 88d68391b..bdccd0491 100644
--- a/test/product_selfadjoint.cpp
+++ b/test/product_selfadjoint.cpp
@@ -59,7 +59,7 @@ template void product_selfadjoint(const MatrixType& m)
   }
 }
 
-void test_product_selfadjoint()
+EIGEN_DECLARE_TEST(product_selfadjoint)
 {
   int s = 0;
   for(int i = 0; i < g_repeat ; i++) {
diff --git a/test/product_small.cpp b/test/product_small.cpp
index fdfdd9f6c..16138631a 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -228,7 +228,7 @@ void bug_1311()
   VERIFY_IS_APPROX(res, A*b);
 }
 
-void test_product_small()
+EIGEN_DECLARE_TEST(product_small)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(Matrix()) );
diff --git a/test/product_symm.cpp b/test/product_symm.cpp
index 7d1042a4f..7d786467d 100644
--- a/test/product_symm.cpp
+++ b/test/product_symm.cpp
@@ -94,7 +94,7 @@ template void symm(int size = Size, in
 }
 
-void test_product_symm()
+EIGEN_DECLARE_TEST(product_symm)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/test/product_syrk.cpp b/test/product_syrk.cpp
index 3ebbe14ca..23406fe4b 100644
--- a/test/product_syrk.cpp
+++ b/test/product_syrk.cpp
@@ -117,7 +117,7 @@ template void syrk(const MatrixType& m)
             ((s1 * m1.row(c).adjoint() * m1.row(c).adjoint().adjoint()).eval().template triangularView().toDenseMatrix()));
 }
 
-void test_product_syrk()
+EIGEN_DECLARE_TEST(product_syrk)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/test/product_trmm.cpp b/test/product_trmm.cpp
index e08d9f39f..c7594e512 100644
--- a/test/product_trmm.cpp
+++ b/test/product_trmm.cpp
@@ -115,7 +115,7 @@ void trmm(int rows=get_random_size(), int cols=get_random_size()
 
 CALL_ALL_ORDERS(EIGEN_CAT(3,NB),SCALAR,StrictlyLower)
 
-void test_product_trmm()
+EIGEN_DECLARE_TEST(product_trmm)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/test/product_trmv.cpp b/test/product_trmv.cpp
index 65d66e57b..5eb1b5ac0 100644
--- a/test/product_trmv.cpp
+++ b/test/product_trmv.cpp
@@ -70,7 +70,7 @@ template void trmv(const MatrixType& m)
   // TODO check with sub-matrices
 }
 
-void test_product_trmv()
+EIGEN_DECLARE_TEST(product_trmv)
 {
   int s = 0;
   for(int i = 0; i < g_repeat ; i++) {
diff --git a/test/product_trsolve.cpp b/test/product_trsolve.cpp
index 4b97fa9d6..0c22cccf6 100644
--- a/test/product_trsolve.cpp
+++ b/test/product_trsolve.cpp
@@ -73,7 +73,7 @@ template void trsolve(int size=Size,int cols
     VERIFY_TRSM(cmLhs.template triangularView(), rmRhs.col(c));
 }
 
-void test_product_trsolve()
+EIGEN_DECLARE_TEST(product_trsolve)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/test/qr.cpp b/test/qr.cpp
index 02e3ed74c..4799aa9ef 100644
--- a/test/qr.cpp
+++ b/test/qr.cpp
@@ -100,7 +100,7 @@ template void qr_verify_assert()
   VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
 }
 
-void test_qr()
+EIGEN_DECLARE_TEST(qr)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( qr(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE),internal::random(1,EIGEN_TEST_MAX_SIZE))) );
diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp
index 96c0badb7..d224a9436 100644
--- a/test/qr_colpivoting.cpp
+++ b/test/qr_colpivoting.cpp
@@ -296,7 +296,7 @@ template void qr_verify_assert()
   VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
 }
 
-void test_qr_colpivoting()
+EIGEN_DECLARE_TEST(qr_colpivoting)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( qr() );
diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp
index c270edf1a..150b4256c 100644
--- a/test/qr_fullpivoting.cpp
+++ b/test/qr_fullpivoting.cpp
@@ -125,7 +125,7 @@ template void qr_verify_assert()
   VERIFY_RAISES_ASSERT(qr.logAbsDeterminant())
 }
 
-void test_qr_fullpivoting()
+EIGEN_DECLARE_TEST(qr_fullpivoting)
 {
   for(int i = 0; i < 1; i++) {
     CALL_SUBTEST_5( qr() );
diff --git a/test/qtvector.cpp b/test/qtvector.cpp
index 22df0d515..4ec79b1e6 100644
--- a/test/qtvector.cpp
+++ b/test/qtvector.cpp
@@ -125,7 +125,7 @@ void check_qtvector_quaternion(const QuaternionType&)
   }
 }
 
-void test_qtvector()
+EIGEN_DECLARE_TEST(qtvector)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST(check_qtvector_matrix(Vector2f()));
diff --git a/test/rand.cpp b/test/rand.cpp
index 51cf01773..1b5c058ab 100644
--- a/test/rand.cpp
+++ b/test/rand.cpp
@@ -54,7 +54,7 @@ template void check_histogram(Scalar x, Scalar y, int bins)
   VERIFY( (((hist.cast()/double(f))-1.0).abs()<0.02).all() );
 }
 
-void test_rand()
+EIGEN_DECLARE_TEST(rand)
 {
   long long_ref = NumTraits::highest()/10;
   signed char char_offset = (std::min)(g_repeat,64);
diff --git a/test/real_qz.cpp b/test/real_qz.cpp
index 3c1492e4b..1cf7aba2d 100644
--- a/test/real_qz.cpp
+++ b/test/real_qz.cpp
@@ -75,7 +75,7 @@ template void real_qz(const MatrixType& m)
   VERIFY_IS_APPROX(qz.matrixZ()*qz.matrixZ().adjoint(), MatrixType::Identity(dim,dim));
 }
 
-void test_real_qz()
+EIGEN_DECLARE_TEST(real_qz)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
diff --git a/test/redux.cpp b/test/redux.cpp
index 213f080aa..9e3ed4546 100644
--- a/test/redux.cpp
+++ b/test/redux.cpp
@@ -146,7 +146,7 @@ template void vectorRedux(const VectorType& w)
   VERIFY_RAISES_ASSERT(v.head(0).maxCoeff());
 }
 
-void test_redux()
+EIGEN_DECLARE_TEST(redux)
 {
   // the max size cannot be too large, otherwise reduxion operations obviously generate large errors.
   int maxsize = (std::min)(100,EIGEN_TEST_MAX_SIZE);
diff --git a/test/ref.cpp b/test/ref.cpp
index 704495aff..a94ea9677 100644
--- a/test/ref.cpp
+++ b/test/ref.cpp
@@ -264,7 +264,7 @@ void test_ref_fixed_size_assert()
   VERIFY_RAISES_STATIC_ASSERT( Ref y = 2*v4; (void)y; );
 }
 
-void test_ref()
+EIGEN_DECLARE_TEST(ref)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( ref_vector(Matrix()) );
diff --git a/test/resize.cpp b/test/resize.cpp
index 4adaafe56..646a75b8f 100644
--- a/test/resize.cpp
+++ b/test/resize.cpp
@@ -33,7 +33,7 @@ void resizeLikeTest12() { resizeLikeTest<1,2>(); }
 void resizeLikeTest1020() { resizeLikeTest<10,20>(); }
 void resizeLikeTest31() { resizeLikeTest<3,1>(); }
 
-void test_resize()
+EIGEN_DECLARE_TEST(resize)
 {
   CALL_SUBTEST(resizeLikeTest12() );
   CALL_SUBTEST(resizeLikeTest1020() );
diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp
index 8887f1b1b..5f52fb3bc 100644
--- a/test/rvalue_types.cpp
+++ b/test/rvalue_types.cpp
@@ -43,7 +43,7 @@ template void rvalue_copyassign(const MatrixType&) {}
 #endif
 
-void test_rvalue_types()
+EIGEN_DECLARE_TEST(rvalue_types)
 {
   CALL_SUBTEST_1(rvalue_copyassign( MatrixXf::Random(50,50).eval() ));
   CALL_SUBTEST_1(rvalue_copyassign( ArrayXXf::Random(50,50).eval() ));
diff --git a/test/schur_complex.cpp b/test/schur_complex.cpp
index deb78e44e..03e17e81d 100644
--- a/test/schur_complex.cpp
+++ b/test/schur_complex.cpp
@@ -79,7 +79,7 @@ template void schur(int size = MatrixType::ColsAtCompileTim
   }
 }
 
-void test_schur_complex()
+EIGEN_DECLARE_TEST(schur_complex)
 {
   CALL_SUBTEST_1(( schur() ));
   CALL_SUBTEST_2(( schur(internal::random(1,EIGEN_TEST_MAX_SIZE/4)) ));
diff --git a/test/schur_real.cpp b/test/schur_real.cpp
index e5229e6e8..945461027 100644
--- a/test/schur_real.cpp
+++ b/test/schur_real.cpp
@@ -98,7 +98,7 @@ template void schur(int size = MatrixType::ColsAtCompileTim
   }
 }
 
-void test_schur_real()
+EIGEN_DECLARE_TEST(schur_real)
 {
   CALL_SUBTEST_1(( schur() ));
   CALL_SUBTEST_2(( schur(internal::random(1,EIGEN_TEST_MAX_SIZE/4)) ));
diff --git a/test/selfadjoint.cpp b/test/selfadjoint.cpp
index bb11cc351..9ca9cef9e 100644
--- a/test/selfadjoint.cpp
+++ b/test/selfadjoint.cpp
@@ -56,7 +56,7 @@ void bug_159()
   EIGEN_UNUSED_VARIABLE(m)
 }
 
-void test_selfadjoint()
+EIGEN_DECLARE_TEST(selfadjoint)
 {
   for(int i = 0; i < g_repeat ; i++)
   {
diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp
index 5bb1e0d6c..314b903e2 100644
--- a/test/simplicial_cholesky.cpp
+++ b/test/simplicial_cholesky.cpp
@@ -39,7 +39,7 @@ template void test_simplicial_cholesky_T()
   check_sparse_spd_solving(ldlt_colmajor_upper_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000);
 }
 
-void test_simplicial_cholesky()
+EIGEN_DECLARE_TEST(simplicial_cholesky)
 {
   CALL_SUBTEST_1(( test_simplicial_cholesky_T() ));
   CALL_SUBTEST_2(( test_simplicial_cholesky_T, int>() ));
diff --git a/test/sizeof.cpp b/test/sizeof.cpp
index e552bae7a..af34e97dd 100644
--- a/test/sizeof.cpp
+++ b/test/sizeof.cpp
@@ -18,7 +18,7 @@ template void verifySizeOf(const MatrixType&)
   VERIFY_IS_EQUAL(sizeof(MatrixType),sizeof(Scalar*) + 2 * sizeof(Index));
 }
 
-void test_sizeof()
+EIGEN_DECLARE_TEST(sizeof)
 {
   CALL_SUBTEST(verifySizeOf(Matrix()) );
   CALL_SUBTEST(verifySizeOf(Array()) );
diff --git a/test/sizeoverflow.cpp b/test/sizeoverflow.cpp
index 240d22294..421351233 100644
--- a/test/sizeoverflow.cpp
+++ b/test/sizeoverflow.cpp
@@ -34,7 +34,7 @@ void triggerVectorBadAlloc(Index size)
   VERIFY_THROWS_BADALLOC( VectorType v; v.conservativeResize(size) );
 }
 
-void test_sizeoverflow()
+EIGEN_DECLARE_TEST(sizeoverflow)
 {
   // there are 2 levels of overflow checking. first in PlainObjectBase.h we check for overflow in rows*cols computations.
  // this is tested in tests of the form times_itself_gives_0 * times_itself_gives_0
diff --git a/test/smallvectors.cpp b/test/smallvectors.cpp
index 781511397..f9803acbb 100644
--- a/test/smallvectors.cpp
+++ b/test/smallvectors.cpp
@@ -57,7 +57,7 @@ template void smallVectors()
   }
 }
 
-void test_smallvectors()
+EIGEN_DECLARE_TEST(smallvectors)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST(smallVectors() );
diff --git a/test/sparseLM.cpp b/test/sparseLM.cpp
index 8e148f9bc..a48fcb685 100644
--- a/test/sparseLM.cpp
+++ b/test/sparseLM.cpp
@@ -168,7 +168,7 @@ void test_sparseLM_T()
   return ;
 }
 
-void test_sparseLM()
+EIGEN_DECLARE_TEST(sparseLM)
 {
   CALL_SUBTEST_1(test_sparseLM_T());
 
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index f84b6e3f5..bfd61f69f 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -641,7 +641,7 @@ void big_sparse_triplet(Index rows, Index cols, double density) {
 }
 
-void test_sparse_basic()
+EIGEN_DECLARE_TEST(sparse_basic)
 {
   for(int i = 0; i < g_repeat; i++) {
     int r = Eigen::internal::random(1,200), c = Eigen::internal::random(1,200);
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
index 2a0b3b617..e7303eab0 100644
--- a/test/sparse_block.cpp
+++ b/test/sparse_block.cpp
@@ -288,7 +288,7 @@ template void sparse_block(const SparseMatrixType& re
   }
 }
 
-void test_sparse_block()
+EIGEN_DECLARE_TEST(sparse_block)
 {
   for(int i = 0; i < g_repeat; i++) {
     int r = Eigen::internal::random(1,200), c = Eigen::internal::random(1,200);
diff --git a/test/sparse_permutations.cpp b/test/sparse_permutations.cpp
index b82cceff8..e93493c39 100644
--- a/test/sparse_permutations.cpp
+++ b/test/sparse_permutations.cpp
@@ -220,7 +220,7 @@ template void sparse_permutations_all(int size)
   CALL_SUBTEST(( sparse_permutations(SparseMatrix(size,size)) ));
 }
 
-void test_sparse_permutations()
+EIGEN_DECLARE_TEST(sparse_permutations)
 {
   for(int i = 0; i < g_repeat; i++) {
     int s = Eigen::internal::random(1,50);
diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp
index f47170b72..d6b7638b1 100644
--- a/test/sparse_product.cpp
+++ b/test/sparse_product.cpp
@@ -453,7 +453,7 @@ void test_mixing_types()
   VERIFY_IS_APPROX( dC2 = sC1 * dR1.col(0), dC3 = sC1 * dR1.template cast().col(0) );
 }
 
-void test_sparse_product()
+EIGEN_DECLARE_TEST(sparse_product)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( (sparse_product >()) );
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
index 5e9607234..12b6f8a9d 100644
--- a/test/sparse_ref.cpp
+++ b/test/sparse_ref.cpp
@@ -126,7 +126,7 @@ void call_ref()
   VERIFY_EVALUATION_COUNT( call_ref_5(A.row(2), A.row(2).transpose()), 1);
 }
 
-void test_sparse_ref()
+EIGEN_DECLARE_TEST(sparse_ref)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( check_const_correctness(SparseMatrix()) );
diff --git a/test/sparse_solvers.cpp b/test/sparse_solvers.cpp
index 3a8873d43..aaf3d39c9 100644
--- a/test/sparse_solvers.cpp
+++ b/test/sparse_solvers.cpp
@@ -101,7 +101,7 @@ template void sparse_solvers(int rows, int cols)
   }
 }
 
-void test_sparse_solvers()
+EIGEN_DECLARE_TEST(sparse_solvers)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(sparse_solvers(8, 8) );
diff --git a/test/sparse_vector.cpp b/test/sparse_vector.cpp
index b3e1dda25..35129278b 100644
--- a/test/sparse_vector.cpp
+++ b/test/sparse_vector.cpp
@@ -145,7 +145,7 @@ template void sparse_vector(int rows, int
 }
 
-void test_sparse_vector()
+EIGEN_DECLARE_TEST(sparse_vector)
 {
   for(int i = 0; i < g_repeat; i++) {
     int r = Eigen::internal::random(1,500), c = Eigen::internal::random(1,500);
diff --git a/test/sparselu.cpp b/test/sparselu.cpp
index bd000baf1..84cc6ebe5 100644
--- a/test/sparselu.cpp
+++ b/test/sparselu.cpp
@@ -36,7 +36,7 @@ template void test_sparselu_T()
   check_sparse_square_determinant(sparselu_amd);
 }
 
-void test_sparselu()
+EIGEN_DECLARE_TEST(sparselu)
 {
   CALL_SUBTEST_1(test_sparselu_T());
   CALL_SUBTEST_2(test_sparselu_T());
diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp
index f0e721fce..3ffe62314 100644
--- a/test/sparseqr.cpp
+++ b/test/sparseqr.cpp
@@ -117,7 +117,7 @@ template void test_sparseqr_scalar()
   dQ = solver.matrixQ();
   VERIFY_IS_APPROX(Q, dQ);
 }
-void test_sparseqr()
+EIGEN_DECLARE_TEST(sparseqr)
 {
   for(int i=0; i
 void special_numbers()
   VERIFY(!mboth.array().allFinite());
 }
 
-void test_special_numbers()
+EIGEN_DECLARE_TEST(special_numbers)
 {
   for(int i = 0; i < 10*g_repeat; i++) {
     CALL_SUBTEST_1( special_numbers() );
diff --git a/test/spqr_support.cpp b/test/spqr_support.cpp
index 81e63b6a5..79c2c12fc 100644
--- a/test/spqr_support.cpp
+++ b/test/spqr_support.cpp
@@ -57,7 +57,7 @@ template void test_spqr_scalar()
   refX = dA.colPivHouseholderQr().solve(b);
   VERIFY(x.isApprox(refX,test_precision()));
 }
-void test_spqr_support()
+EIGEN_DECLARE_TEST(spqr_support)
 {
   CALL_SUBTEST_1(test_spqr_scalar());
   CALL_SUBTEST_2(test_spqr_scalar >());
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index 255cae9b3..ee5f91674 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -220,7 +220,7 @@ void test_hypot()
   VERIFY((numext::isnan)(numext::hypot(a,nan)));
 }
 
-void test_stable_norm()
+EIGEN_DECLARE_TEST(stable_norm)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_3( test_hypot() );
diff --git a/test/stddeque.cpp b/test/stddeque.cpp
index b511c4e61..0a6aa2572 100644
--- a/test/stddeque.cpp
+++ b/test/stddeque.cpp
@@ -100,7 +100,7 @@ void check_stddeque_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v.back(), x);
 }
 
-void test_stddeque()
+EIGEN_DECLARE_TEST(stddeque)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
diff --git a/test/stddeque_overload.cpp b/test/stddeque_overload.cpp
index 0541bc1bd..eebe93d81 100644
--- a/test/stddeque_overload.cpp
+++ b/test/stddeque_overload.cpp
@@ -128,7 +128,7 @@ void check_stddeque_quaternion(const QuaternionType&)
   }
 }
 
-void test_stddeque_overload()
+EIGEN_DECLARE_TEST(stddeque_overload)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stddeque_matrix(Vector2f()));
diff --git a/test/stdlist.cpp b/test/stdlist.cpp
index 23cbe9039..c9e5b7286 100644
--- a/test/stdlist.cpp
+++ b/test/stdlist.cpp
@@ -100,7 +100,7 @@ void check_stdlist_quaternion(const QuaternionType&)
   VERIFY_IS_APPROX(v.back(), x);
 }
 
-void test_stdlist()
+EIGEN_DECLARE_TEST(stdlist)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
diff --git a/test/stdlist_overload.cpp b/test/stdlist_overload.cpp
index 386124ad4..93b6fc9ed 100644
--- a/test/stdlist_overload.cpp
+++ b/test/stdlist_overload.cpp
@@ -162,7 +162,7 @@ void check_stdlist_quaternion(const QuaternionType&)
   }
 }
 
-void test_stdlist_overload()
+EIGEN_DECLARE_TEST(stdlist_overload)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stdlist_matrix(Vector2f()));
diff --git a/test/stdvector.cpp b/test/stdvector.cpp
index cf3646870..e2b7bd061 100644
--- a/test/stdvector.cpp
+++ b/test/stdvector.cpp
@@ -117,7 +117,7 @@ void check_stdvector_quaternion(const QuaternionType&)
   }
 }
 
-void test_stdvector()
+EIGEN_DECLARE_TEST(stdvector)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
diff --git a/test/stdvector_overload.cpp b/test/stdvector_overload.cpp
index 0c86ce8a4..5c042a64c 100644
--- a/test/stdvector_overload.cpp
+++ b/test/stdvector_overload.cpp
@@ -131,7 +131,7 @@ void check_stdvector_quaternion(const QuaternionType&)
   }
 }
 
-void test_stdvector_overload()
+EIGEN_DECLARE_TEST(stdvector_overload)
 {
   // some non vectorizable fixed sizes
   CALL_SUBTEST_1(check_stdvector_matrix(Vector2f()));
diff --git a/test/superlu_support.cpp b/test/superlu_support.cpp
index 98a7bc5c8..55450c868 100644
--- a/test/superlu_support.cpp
+++ b/test/superlu_support.cpp
@@ -12,7 +12,7 @@
 
 #include 
 
-void test_superlu_support()
+EIGEN_DECLARE_TEST(superlu_support)
 {
   SuperLU > superlu_double_colmajor;
   SuperLU > > superlu_cplxdouble_colmajor;
diff --git a/test/swap.cpp b/test/swap.cpp
index a294da719..5b259d3ec 100644
--- a/test/swap.cpp
+++ b/test/swap.cpp
@@ -83,7 +83,7 @@ template void swap(const MatrixType& m)
   }
 }
 
-void test_swap()
+EIGEN_DECLARE_TEST(swap)
 {
   int s = internal::random(1,EIGEN_TEST_MAX_SIZE);
   CALL_SUBTEST_1( swap(Matrix3f()) ); // fixed size, no vectorization
diff --git a/test/symbolic_index.cpp b/test/symbolic_index.cpp
index 4c079857b..cf89b9fbe 100644
--- a/test/symbolic_index.cpp
+++ b/test/symbolic_index.cpp
@@ -113,7 +113,7 @@ void check_symbolic_index()
 #endif
 }
 
-void test_symbolic_index()
+EIGEN_DECLARE_TEST(symbolic_index)
 {
   CALL_SUBTEST_1( check_symbolic_index() );
   CALL_SUBTEST_2( check_symbolic_index() );
diff --git a/test/triangular.cpp b/test/triangular.cpp
index d50342ccc..73b6fc46b 100644
--- a/test/triangular.cpp
+++ b/test/triangular.cpp
@@ -220,7 +220,7 @@ void bug_159()
   EIGEN_UNUSED_VARIABLE(m)
 }
 
-void test_triangular()
+EIGEN_DECLARE_TEST(triangular)
 {
   int maxsize = (std::min)(EIGEN_TEST_MAX_SIZE,20);
   for(int i = 0; i < g_repeat ; i++)
diff --git a/test/umeyama.cpp b/test/umeyama.cpp
index 2e8092434..1590a0a81 100644
--- a/test/umeyama.cpp
+++ b/test/umeyama.cpp
@@ -155,7 +155,7 @@ void run_fixed_size_test(int num_elements)
   VERIFY(error < Scalar(16)*std::numeric_limits::epsilon());
 }
 
-void test_umeyama()
+EIGEN_DECLARE_TEST(umeyama)
 {
   for (int i=0; i
 void test_umfpack_support_T()
   check_sparse_square_determinant(umfpack_rowmajor);
 }
 
-void test_umfpack_support()
+EIGEN_DECLARE_TEST(umfpack_support)
 {
   CALL_SUBTEST_1((test_umfpack_support_T()));
   CALL_SUBTEST_2((test_umfpack_support_T, int>()));
diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp
index 731a08977..120cc42bb 100644
--- a/test/unalignedassert.cpp
+++ b/test/unalignedassert.cpp
@@ -174,7 +174,7 @@ void unalignedassert()
 #endif
 }
 
-void test_unalignedassert()
+EIGEN_DECLARE_TEST(unalignedassert)
 {
   CALL_SUBTEST(unalignedassert());
 }
diff --git a/test/unalignedcount.cpp b/test/unalignedcount.cpp
index d6ffeafdf..069fc1bb9 100644
--- a/test/unalignedcount.cpp
+++ b/test/unalignedcount.cpp
@@ -28,7 +28,7 @@ static int nb_storeu;
 
 #include "main.h"
 
-void test_unalignedcount()
+EIGEN_DECLARE_TEST(unalignedcount)
 {
 #if defined(EIGEN_VECTORIZE_AVX)
   VectorXf a(40), b(40);
diff --git a/test/upperbidiagonalization.cpp b/test/upperbidiagonalization.cpp
index 27a747875..945c99959 100644
--- a/test/upperbidiagonalization.cpp
+++ b/test/upperbidiagonalization.cpp
@@ -29,7 +29,7 @@ template void upperbidiag(const MatrixType& m)
   VERIFY_IS_APPROX(a.adjoint(),d);
 }
 
-void test_upperbidiagonalization()
+EIGEN_DECLARE_TEST(upperbidiagonalization)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( upperbidiag(MatrixXf(3,3)) );
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index 37e7495f5..01b55e192 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -386,7 +386,7 @@ template struct vectorization_logic_half
   static void run() {}
 };
 
-void test_vectorization_logic()
+EIGEN_DECLARE_TEST(vectorization_logic)
 {
 
 #ifdef EIGEN_VECTORIZE
 
diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp
index a099d17c8..2d7ddbed1 100644
--- a/test/vectorwiseop.cpp
+++ b/test/vectorwiseop.cpp
@@ -237,7 +237,7 @@ template void vectorwiseop_matrix(const MatrixType& m)
   VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) );
 }
 
-void test_vectorwiseop()
+EIGEN_DECLARE_TEST(vectorwiseop)
 {
   CALL_SUBTEST_1( vectorwiseop_array(Array22cd()) );
   CALL_SUBTEST_2( vectorwiseop_array(Array()) );
diff --git a/test/visitor.cpp b/test/visitor.cpp
index 7f4efab97..de938fc95 100644
--- a/test/visitor.cpp
+++ b/test/visitor.cpp
@@ -113,7 +113,7 @@ template void vectorVisitor(const VectorType& w)
   VERIFY(eigen_maxidx == (std::min)(idx0,idx2));
 }
 
-void test_visitor()
+EIGEN_DECLARE_TEST(visitor)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( matrixVisitor(Matrix()) );
diff --git a/test/zerosized.cpp b/test/zerosized.cpp
index 477ff0070..edd1f6925 100644
--- a/test/zerosized.cpp
+++ b/test/zerosized.cpp
@@ -81,7 +81,7 @@ template void zeroSizedVector()
   }
 }
 
-void test_zerosized()
+EIGEN_DECLARE_TEST(zerosized)
 {
   zeroSizedMatrix();
   zeroSizedMatrix();
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index 5f64501be..62561da1d 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -117,7 +117,7 @@ class LevenbergMarquardt : internal::no_assignment_operator
     typedef typename JacobianType::RealScalar RealScalar;
     typedef typename QRSolver::StorageIndex PermIndex;
     typedef Matrix FVectorType;
-    typedef PermutationMatrix PermutationType;
+    typedef PermutationMatrix PermutationType;
   public:
     LevenbergMarquardt(FunctorType& functor) : m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0),
diff --git a/unsupported/test/BVH.cpp b/unsupported/test/BVH.cpp
index ff5b3299d..d8c39d556 100644
--- a/unsupported/test/BVH.cpp
+++ b/unsupported/test/BVH.cpp
@@ -192,7 +192,7 @@ struct TreeTest
 };
 
 
-void test_BVH()
+EIGEN_DECLARE_TEST(BVH)
 {
   for(int i = 0; i < g_repeat; i++) {
 #ifdef EIGEN_TEST_PART_1
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
index 500fb2d17..572fc08a3 100644
--- a/unsupported/test/EulerAngles.cpp
+++ b/unsupported/test/EulerAngles.cpp
@@ -272,7 +272,7 @@ template void eulerangles_rand()
   check_all_var(ea);
 }
 
-void test_EulerAngles()
+EIGEN_DECLARE_TEST(EulerAngles)
 {
   // Simple cast test
   EulerAnglesXYZd onesEd(1, 1, 1);
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index 8b7528fb7..cfe559ebd 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -225,7 +225,7 @@ void test_return_by_value(int len)
   VERIFY( (in1-in).norm() < test_precision() );
 }
 
-void test_FFTW()
+EIGEN_DECLARE_TEST(FFTW)
 {
   CALL_SUBTEST( test_return_by_value(32) );
   //CALL_SUBTEST( ( test_complex2d () ) ); CALL_SUBTEST( ( test_complex2d () ) );
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 1d682dd83..cc95cea29 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -1818,7 +1818,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_NonLinearOptimization()
+EIGEN_DECLARE_TEST(NonLinearOptimization)
 {
   // Tests using the examples provided by (c)minpack
   CALL_SUBTEST/*_1*/(testChkder());
diff --git a/unsupported/test/NumericalDiff.cpp b/unsupported/test/NumericalDiff.cpp
index 27d888056..35f2f6d7c 100644
--- a/unsupported/test/NumericalDiff.cpp
+++ b/unsupported/test/NumericalDiff.cpp
@@ -107,7 +107,7 @@ void test_central()
   VERIFY_IS_APPROX(jac, actual_jac);
 }
 
-void test_NumericalDiff()
+EIGEN_DECLARE_TEST(NumericalDiff)
 {
   CALL_SUBTEST(test_forward());
   CALL_SUBTEST(test_central());
diff --git a/unsupported/test/alignedvector3.cpp b/unsupported/test/alignedvector3.cpp
index 252cb1d3f..fcc89daab 100644
--- a/unsupported/test/alignedvector3.cpp
+++ b/unsupported/test/alignedvector3.cpp
@@ -76,7 +76,7 @@ void alignedvector3()
   VERIFY(ss1.str()==ss2.str());
 }
 
-void test_alignedvector3()
+EIGEN_DECLARE_TEST(alignedvector3)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST( alignedvector3() );
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 1c5e0dc66..495cee03e 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -354,7 +354,7 @@ double bug_1264() {
 #endif
 
-void test_autodiff()
+EIGEN_DECLARE_TEST(autodiff)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( test_autodiff_scalar<1>() );
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index 1d4418203..e81a7788b 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -89,7 +89,7 @@ void check_limits_specialization()
 #endif
 }
 
-void test_autodiff_scalar()
+EIGEN_DECLARE_TEST(autodiff_scalar)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( check_atan2() );
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 3b598bf42..2f1418684 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -135,7 +135,7 @@ static void test_stress_eventcount()
   }
 }
 
-void test_cxx11_eventcount()
+EIGEN_DECLARE_TEST(cxx11_eventcount)
 {
   CALL_SUBTEST(test_basic_eventcount());
   CALL_SUBTEST(test_stress_eventcount());
diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
index 8911c59d8..510e11032 100644
--- a/unsupported/test/cxx11_meta.cpp
+++ b/unsupported/test/cxx11_meta.cpp
@@ -340,7 +340,7 @@ static void test_array_misc()
   VERIFY_IS_EQUAL((instantiate_by_c_array(data).c), 5);
 }
 
-void test_cxx11_meta()
+EIGEN_DECLARE_TEST(cxx11_meta)
 {
   CALL_SUBTEST(test_gen_numeric_list());
   CALL_SUBTEST(test_concat());
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 48cd2d4e4..5a8080ea3 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -116,7 +116,7 @@ static void test_cancel()
   tp.Cancel();
 }
 
-void test_cxx11_non_blocking_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool)
 {
   CALL_SUBTEST(test_create_destroy_empty_pool());
   CALL_SUBTEST(test_parallelism(true));
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index 91f690114..8fc5a3074 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -227,7 +227,7 @@ void test_stress_runqueue()
   VERIFY(total.load() == 0);
 }
 
-void test_cxx11_runqueue()
+EIGEN_DECLARE_TEST(cxx11_runqueue)
 {
   CALL_SUBTEST_1(test_basic_runqueue());
   CALL_SUBTEST_2(test_empty_runqueue());
diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp
index 037767270..4a0c8967b 100644
--- a/unsupported/test/cxx11_tensor_argmax.cpp
+++ b/unsupported/test/cxx11_tensor_argmax.cpp
@@ -273,7 +273,7 @@ static void test_argmin_dim()
   }
 }
 
-void test_cxx11_tensor_argmax()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax)
 {
   CALL_SUBTEST(test_simple_index_tuples());
   CALL_SUBTEST(test_simple_index_tuples());
diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 541a27865..f6c3a9908 100644
--- a/unsupported/test/cxx11_tensor_argmax_gpu.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -9,7 +9,7 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_gpu
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
index 521a7f82c..0bbb0f6dc 100644
--- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_argmax_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -237,7 +237,7 @@ template void sycl_argmax_test_per_
   test_sycl_argmin_dim(sycl_device);
 }
 
-void test_cxx11_tensor_argmax_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_argmax_test_per_device(device));
   }
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index 8fe85d83c..ce9d24369 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -358,7 +358,7 @@ static void test_std_initializers_tensor() {
 #endif  // EIGEN_HAS_VARIADIC_TEMPLATES
 }
 
-void test_cxx11_tensor_assign()
+EIGEN_DECLARE_TEST(cxx11_tensor_assign)
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 21fdfca22..20f84b8e0 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -137,7 +137,7 @@ template void sycl_broadcast_test_per_device(const cl::sycl::
   test_broadcast_sycl_fixed(sycl_device);
 }
 
-void test_cxx11_tensor_broadcast_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_broadcast_test_per_device(device));
   }
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index f0ff03184..2f8ab6afd 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -292,7 +292,7 @@ static void test_simple_broadcasting_one_by_n_by_one_2d()
   }
 }
 
-void test_cxx11_tensor_broadcasting()
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting)
 {
   CALL_SUBTEST(test_simple_broadcasting());
   CALL_SUBTEST(test_simple_broadcasting());
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index 400a31d09..db2975783 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -257,7 +257,7 @@ static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
   TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
 }
 
-void test_cxx11_tensor_builtins_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
diff --git a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
index a2928b0b3..0a37a555c 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
@@ -9,7 +9,7 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_gpu
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
index 3c6d0d2ff..c4fe9a798 100644
--- a/unsupported/test/cxx11_tensor_casts.cpp
+++ b/unsupported/test/cxx11_tensor_casts.cpp
@@ -105,7 +105,7 @@ static void test_small_to_big_type_cast()
 }
 
 
-void test_cxx11_tensor_casts()
+EIGEN_DECLARE_TEST(cxx11_tensor_casts)
 {
   CALL_SUBTEST(test_simple_cast());
   CALL_SUBTEST(test_vectorized_cast());
diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
index 89cf5c7b7..922274462 100644
--- a/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -410,7 +410,7 @@ static void test_chip_raw_data_row_major()
   VERIFY_IS_EQUAL(chip4.data(), static_cast(0));
 }
 
-void test_cxx11_tensor_chipping()
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping)
 {
   CALL_SUBTEST(test_simple_chip());
   CALL_SUBTEST(test_simple_chip());
diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
index 39e4f0a7f..a91efe00c 100644
--- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -15,7 +15,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_chipping_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -614,7 +614,7 @@ template void sycl_chipping_test_per_d
   test_chip_as_lvalue_sycl(sycl_device);
   test_chip_as_lvalue_sycl(sycl_device);
 }
-void test_cxx11_tensor_chipping_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_chipping_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
index b1ff8aecb..1a18e07cc 100644
--- a/unsupported/test/cxx11_tensor_comparisons.cpp
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -77,7 +77,7 @@ static void test_equality()
 }
 
 
-void test_cxx11_tensor_comparisons()
+EIGEN_DECLARE_TEST(cxx11_tensor_comparisons)
 {
   CALL_SUBTEST(test_orderings());
   CALL_SUBTEST(test_equality());
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
index 3b078d1d1..aa28457b1 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
@@ -8,7 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
diff --git a/unsupported/test/cxx11_tensor_complex_gpu.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu
index 45b49d266..7cf06aa7a 100644
--- a/unsupported/test/cxx11_tensor_complex_gpu.cu
+++ b/unsupported/test/cxx11_tensor_complex_gpu.cu
@@ -8,7 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index 03ef12e63..9189a609b 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -123,7 +123,7 @@ static void test_concatenation_as_lvalue()
 }
 
 
-void test_cxx11_tensor_concatenation()
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation)
 {
   CALL_SUBTEST(test_dimension_failures());
   CALL_SUBTEST(test_dimension_failures());
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
index e3023a368..765991b35 100644
--- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -173,7 +173,7 @@ template void tensorConcat_perDevice(
   test_simple_concatenation(sycl_device);
   test_concatenation_as_lvalue(sycl_device);
 }
-void test_cxx11_tensor_concatenation_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(tensorConcat_perDevice(device));
   }
diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp
index ad9c9da39..9d806ee3c 100644
--- a/unsupported/test/cxx11_tensor_const.cpp
+++ b/unsupported/test/cxx11_tensor_const.cpp
@@ -55,7 +55,7 @@ static void test_assign_of_const_tensor()
 }
 
 
-void test_cxx11_tensor_const()
+EIGEN_DECLARE_TEST(cxx11_tensor_const)
 {
   CALL_SUBTEST(test_simple_assign());
   CALL_SUBTEST(test_assign_of_const_tensor());
diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu
index 061d0464e..cb1416478 100644
--- a/unsupported/test/cxx11_tensor_contract_gpu.cu
+++ b/unsupported/test/cxx11_tensor_contract_gpu.cu
@@ -10,7 +10,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_gpu
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp
index 5bace66c5..c8e86e69f 100644
--- a/unsupported/test/cxx11_tensor_contract_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -283,7 +283,7 @@ template void tensorContractionPerDevice(Dev_selector& s
 }
 
 
-void test_cxx11_tensor_contract_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(tensorContractionPerDevice(device));
   }
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 918c96277..d4cfbd0da 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -559,7 +559,7 @@ static void test_large_contraction_with_output_kernel() {
   }
 }
 
-void test_cxx11_tensor_contraction()
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
 {
   CALL_SUBTEST(test_evals());
   CALL_SUBTEST(test_evals());
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
index e3d4675eb..01bc77bc1 100644
--- a/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -136,7 +136,7 @@ static void test_strides() {
                                            input(12)*kernel(2)));
 }
 
-void test_cxx11_tensor_convolution()
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution)
 {
   CALL_SUBTEST(test_evals());
   CALL_SUBTEST(test_evals());
diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
index a4226a63a..3954c8a28 100644
--- a/unsupported/test/cxx11_tensor_convolution_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -462,7 +462,7 @@ template void tensorConvolutionPerDevice(Dev_selector& s
   test_strides(sycl_device);
 }
 
-void test_cxx11_tensor_convolution_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(tensorConvolutionPerDevice(device));
  }
diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp
index 4528cc176..b5dbc97bd 100644
--- a/unsupported/test/cxx11_tensor_custom_index.cpp
+++ b/unsupported/test/cxx11_tensor_custom_index.cpp
@@ -88,7 +88,7 @@ static void test_sizes_as_index()
 }
 
 
-void test_cxx11_tensor_custom_index() {
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_index) {
   test_map_as_index();
   test_map_as_index();
   test_matrix_as_index();
diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp
index 8baa477cc..875ea57d2 100644
--- a/unsupported/test/cxx11_tensor_custom_op.cpp
+++ b/unsupported/test/cxx11_tensor_custom_op.cpp
@@ -104,7 +104,7 @@ static void test_custom_binary_op()
 }
 
 
-void test_cxx11_tensor_custom_op()
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op)
 {
   CALL_SUBTEST(test_custom_unary_op());
   CALL_SUBTEST(test_custom_binary_op());
diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
index 9ff287fff..cc3b02448 100644
--- a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -158,7 +158,7 @@ template void custom_op_perDevice(Dev
   test_custom_binary_op_sycl(sycl_device);
 }
-void test_cxx11_tensor_custom_op_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op_sycl) {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(custom_op_perDevice(device));
  }
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index 52215fc39..cd9ba3ecd 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -9,7 +9,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 3ecc68df0..5095cb078 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -70,7 +70,7 @@ template void sycl_device_test_per_device(const cl::sycl::dev
   //test_device_exceptions(sycl_device);
 }
 
-void test_cxx11_tensor_device_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_device_test_per_device(device));
  }
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 16f168ed4..10364d4b4 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -60,7 +60,7 @@ static void test_rank_zero()
   VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
 }
 
-void test_cxx11_tensor_dimension()
+EIGEN_DECLARE_TEST(cxx11_tensor_dimension)
 {
   CALL_SUBTEST(test_dynamic_size());
   CALL_SUBTEST(test_fixed_size());
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
index d7eea42d7..fd889c46c 100644
--- a/unsupported/test/cxx11_tensor_empty.cpp
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -33,7 +33,7 @@ static void test_empty_fixed_size_tensor()
 }
 
 
-void test_cxx11_tensor_empty()
+EIGEN_DECLARE_TEST(cxx11_tensor_empty)
 {
   CALL_SUBTEST(test_empty_tensor());
   CALL_SUBTEST(test_empty_fixed_size_tensor());
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index d64301318..30924b6b6 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -366,7 +366,7 @@ static void test_minmax_nan_propagation()
   test_minmax_nan_propagation_templ();
 }
 
-void test_cxx11_tensor_expr()
+EIGEN_DECLARE_TEST(cxx11_tensor_expr)
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
index ef5f1a312..4e4c9c4ec 100644
--- a/unsupported/test/cxx11_tensor_fft.cpp
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -253,7 +253,7 @@ static void test_fft_non_power_of_2_round_trip(int exponent) {
   }
 }
 
-void test_cxx11_tensor_fft() {
+EIGEN_DECLARE_TEST(cxx11_tensor_fft) {
   test_fft_complex_input_golden();
   test_fft_real_input_golden();
 
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index e6274f8eb..456ce6bea 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -250,7 +250,7 @@ static void test_array()
   }
 }
 
-void test_cxx11_tensor_fixed_size()
+EIGEN_DECLARE_TEST(cxx11_tensor_fixed_size)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index 45d7345e9..f76e2ea97 100644
--- a/unsupported/test/cxx11_tensor_forced_eval.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -72,7 +72,7 @@ static void test_const()
 }
 
 
-void test_cxx11_tensor_forced_eval()
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval)
 {
   CALL_SUBTEST(test_simple());
   CALL_SUBTEST(test_const());
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index a21514d56..74d38a644 100644
--- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -69,7 +69,7 @@ template void tensorForced_evalperDev
   test_forced_eval_sycl(sycl_device);
   test_forced_eval_sycl(sycl_device);
 }
-void test_cxx11_tensor_forced_eval_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(tensorForced_evalperDevice(device));
  }
diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp
index dcb928714..ee5e29b77 100644
--- a/unsupported/test/cxx11_tensor_generator.cpp
+++ b/unsupported/test/cxx11_tensor_generator.cpp
@@ -80,7 +80,7 @@ static void test_gaussian()
 }
 
 
-void test_cxx11_tensor_generator()
+EIGEN_DECLARE_TEST(cxx11_tensor_generator)
 {
   CALL_SUBTEST(test_1D());
   CALL_SUBTEST(test_1D());
diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp
index f551c8d0c..fb6e3d9d0 100644
--- a/unsupported/test/cxx11_tensor_generator_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_generator_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 static const float error_threshold =1e-8f;
@@ -139,7 +139,7 @@ template void sycl_generator_test_per_
   test_gaussian_sycl(sycl_device);
   test_gaussian_sycl(sycl_device);
 }
-void test_cxx11_tensor_generator_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_generator_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_generator_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 285441182..faaac73cf 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -9,7 +9,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_gpu
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp
index 5fd88fa6c..c20edd9ac 100644
--- a/unsupported/test/cxx11_tensor_ifft.cpp
+++ b/unsupported/test/cxx11_tensor_ifft.cpp
@@ -131,7 +131,7 @@ static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3)
   }
 }
 
-void test_cxx11_tensor_ifft() {
+EIGEN_DECLARE_TEST(cxx11_tensor_ifft) {
   CALL_SUBTEST(test_1D_fft_ifft_invariant(4));
   CALL_SUBTEST(test_1D_fft_ifft_invariant(16));
   CALL_SUBTEST(test_1D_fft_ifft_invariant(32));
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
index 105d32fb4..862f1f7f0 100644
--- a/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -797,7 +797,7 @@ void test_imagenet_patches()
   }
 }
 
-void test_cxx11_tensor_image_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch)
 {
   CALL_SUBTEST_1(test_simple_patch());
   CALL_SUBTEST_2(test_patch_no_extra_dim());
diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
index eea18ec70..c1828a0ec 100644
--- a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_image_patch_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -1084,7 +1084,7 @@
 test_patch_padding_same_sycl(sycl_device);
 test_patch_no_extra_dim_sycl(sycl_device);
 test_imagenet_patches_sycl(sycl_device);
 }
-void test_cxx11_tensor_image_patch_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_tensor_image_patch_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
index 4cf5df666..e81fa5e40 100644
--- a/unsupported/test/cxx11_tensor_index_list.cpp
+++ b/unsupported/test/cxx11_tensor_index_list.cpp
@@ -373,7 +373,7 @@ static void test_dim_check()
 #endif
 
 
-void test_cxx11_tensor_index_list()
+EIGEN_DECLARE_TEST(cxx11_tensor_index_list)
 {
 #ifdef EIGEN_HAS_INDEX_LIST
   CALL_SUBTEST(test_static_index_list());
diff --git a/unsupported/test/cxx11_tensor_inflation.cpp b/unsupported/test/cxx11_tensor_inflation.cpp
index 4997935e9..75089e856 100644
--- a/unsupported/test/cxx11_tensor_inflation.cpp
+++ b/unsupported/test/cxx11_tensor_inflation.cpp
@@ -74,7 +74,7 @@ static void test_simple_inflation()
   }
 }
 
-void test_cxx11_tensor_inflation()
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation)
 {
   CALL_SUBTEST(test_simple_inflation());
   CALL_SUBTEST(test_simple_inflation());
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
index cf3e29f4c..521ae0cc3 100644
--- a/unsupported/test/cxx11_tensor_inflation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_inflation_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -128,7 +128,7 @@ template void sycl_inflation_test_per_
   test_simple_inflation_sycl(sycl_device);
   test_simple_inflation_sycl(sycl_device);
 }
-void test_cxx11_tensor_inflation_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_inflation_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
index 8e2b70b75..d18a05ec4 100644
--- a/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -135,7 +135,7 @@ void test_specific() {
   VERIFY_IS_EQUAL(result, result_op);
 }
 
-void test_cxx11_tensor_intdiv()
+EIGEN_DECLARE_TEST(cxx11_tensor_intdiv)
 {
   CALL_SUBTEST_1(test_signed_32bit());
   CALL_SUBTEST_2(test_unsigned_32bit());
diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
index 489960529..2c638f9bf 100644
--- a/unsupported/test/cxx11_tensor_io.cpp
+++ b/unsupported/test/cxx11_tensor_io.cpp
@@ -119,7 +119,7 @@ static void test_output_const()
 }
 
 
-void test_cxx11_tensor_io()
+EIGEN_DECLARE_TEST(cxx11_tensor_io)
 {
   CALL_SUBTEST(test_output_0d());
   CALL_SUBTEST(test_output_0d());
diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp
index ae297a9da..efb333360 100644
--- a/unsupported/test/cxx11_tensor_layout_swap.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap.cpp
@@ -54,7 +54,7 @@ static void test_swap_as_lvalue()
 }
 
 
-void test_cxx11_tensor_layout_swap()
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap)
 {
   CALL_SUBTEST(test_simple_swap());
   CALL_SUBTEST(test_swap_as_lvalue());
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
index 9e8db8b4b..9546b911c 100644
--- a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -14,7 +14,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_layout_swap_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -118,7 +118,7 @@ template void sycl_tensor_layout_swap_
   test_simple_swap_sycl(sycl_device);
   test_swap_as_lvalue_sycl(sycl_device);
 }
-void test_cxx11_tensor_layout_swap_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp
index 071f5b406..6ba9a212d 100644
--- a/unsupported/test/cxx11_tensor_lvalue.cpp
+++ b/unsupported/test/cxx11_tensor_lvalue.cpp
@@ -36,7 +36,7 @@ static void test_compound_assignment()
 }
 
 
-void test_cxx11_tensor_lvalue()
+EIGEN_DECLARE_TEST(cxx11_tensor_lvalue)
 {
   CALL_SUBTEST(test_compound_assignment());
 }
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
index 3db0ee7c0..ce608aca7 100644
--- a/unsupported/test/cxx11_tensor_map.cpp
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -265,7 +265,7 @@ static void test_casting()
   VERIFY_IS_EQUAL(sum1, 861);
 }
 
-void test_cxx11_tensor_map()
+EIGEN_DECLARE_TEST(cxx11_tensor_map)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp
index 61c742a16..82a1a26d8 100644
--- a/unsupported/test/cxx11_tensor_math.cpp
+++ b/unsupported/test/cxx11_tensor_math.cpp
@@ -39,7 +39,7 @@ static void test_sigmoid()
 }
 
 
-void test_cxx11_tensor_math()
+EIGEN_DECLARE_TEST(cxx11_tensor_math)
 {
   CALL_SUBTEST(test_tanh());
   CALL_SUBTEST(test_sigmoid());
diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
index 4fba6fdd1..ee2616fd7 100644
--- a/unsupported/test/cxx11_tensor_mixed_indices.cpp
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -47,7 +47,7 @@ static void test_simple()
 }
 
 
-void test_cxx11_tensor_mixed_indices()
+EIGEN_DECLARE_TEST(cxx11_tensor_mixed_indices)
 {
   CALL_SUBTEST(test_simple());
 }
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index f7de43110..6365cd89a 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -459,7 +459,7 @@ static void test_composition()
 }
 
 
-void test_cxx11_tensor_morphing()
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing)
 {
   CALL_SUBTEST_1(test_simple_reshape());
   CALL_SUBTEST_1(test_reshape_in_expr());
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index 9b521bc6b..93dabe3ec 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -15,7 +15,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -240,7 +240,7 @@ template void sycl_morphing_test_per_d
   test_strided_slice_write_sycl(sycl_device);
   test_strided_slice_write_sycl(sycl_device);
 }
-void test_cxx11_tensor_morphing_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_morphing_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index 183ef02c1..f63fee013 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -65,7 +65,7 @@ static void test_notification_multiple()
   VERIFY_IS_EQUAL(counter, 4);
 }
 
-void test_cxx11_tensor_notification()
+EIGEN_DECLARE_TEST(cxx11_tensor_notification)
 {
   CALL_SUBTEST(test_notification_single());
   CALL_SUBTEST(test_notification_multiple());
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
index e9d1b2d3c..99e18076a 100644
--- a/unsupported/test/cxx11_tensor_of_complex.cpp
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -94,7 +94,7 @@ static void test_contractions()
 }
 
 
-void test_cxx11_tensor_of_complex()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_complex)
 {
   CALL_SUBTEST(test_additions());
   CALL_SUBTEST(test_abs());
diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp
index f179a0c21..344d678ef 100644
--- a/unsupported/test/cxx11_tensor_of_const_values.cpp
+++ b/unsupported/test/cxx11_tensor_of_const_values.cpp
@@ -97,7 +97,7 @@ static void test_plus_equal()
 }
 
 
-void test_cxx11_tensor_of_const_values()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_const_values)
 {
   CALL_SUBTEST(test_assign());
   CALL_SUBTEST(test_plus());
diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index 150fde8bf..1f1ec26c2 100644
--- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -9,7 +9,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_gpu
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
index 4ef9aed91..159656276 100644
--- a/unsupported/test/cxx11_tensor_of_strings.cpp
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -141,7 +141,7 @@ static void test_initialization()
 }
 
 
-void test_cxx11_tensor_of_strings()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_strings)
 {
   // Beware: none of this is likely to ever work on a GPU.
   CALL_SUBTEST(test_assign());
diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp
index ffa19896e..b8a329deb 100644
--- a/unsupported/test/cxx11_tensor_padding.cpp
+++ b/unsupported/test/cxx11_tensor_padding.cpp
@@ -84,7 +84,7 @@ static void test_padded_expr()
   }
 }
 
-void test_cxx11_tensor_padding()
+EIGEN_DECLARE_TEST(cxx11_tensor_padding)
 {
   CALL_SUBTEST(test_simple_padding());
   CALL_SUBTEST(test_simple_padding());
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
index dc748b73e..727a9ffd7 100644
--- a/unsupported/test/cxx11_tensor_padding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -15,7 +15,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -149,7 +149,7 @@ template void sycl_padding_test_per_de
   test_padded_expr(sycl_device);
 }
 
-void test_cxx11_tensor_padding_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_padding_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp
index 434359730..498ab8ca7 100644
--- a/unsupported/test/cxx11_tensor_patch.cpp
+++ b/unsupported/test/cxx11_tensor_patch.cpp
@@ -164,7 +164,7 @@ static void test_simple_patch()
   }
 }
 
-void test_cxx11_tensor_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_patch)
 {
   CALL_SUBTEST(test_simple_patch());
   CALL_SUBTEST(test_simple_patch());
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
index 88a29cb31..7f92bec78 100644
--- a/unsupported/test/cxx11_tensor_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -14,7 +14,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_patch_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -241,7 +241,7 @@ template void sycl_tensor_patch_test_p
   test_simple_patch_sycl(sycl_device);
   test_simple_patch_sycl(sycl_device);
 }
-void test_cxx11_tensor_patch_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_tensor_patch_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
index 0f3dc5787..4740d5811 100644
--- a/unsupported/test/cxx11_tensor_random.cpp
+++ b/unsupported/test/cxx11_tensor_random.cpp
@@ -70,7 +70,7 @@ static void test_custom()
   }
 }
 
-void test_cxx11_tensor_random()
+EIGEN_DECLARE_TEST(cxx11_tensor_random)
 {
   CALL_SUBTEST(test_default());
   CALL_SUBTEST(test_normal());
diff --git a/unsupported/test/cxx11_tensor_random_gpu.cu b/unsupported/test/cxx11_tensor_random_gpu.cu
index da5977f09..262182d30 100644
--- a/unsupported/test/cxx11_tensor_random_gpu.cu
+++ b/unsupported/test/cxx11_tensor_random_gpu.cu
@@ -9,7 +9,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_gpu
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index 1490ec3da..4c8a95c34 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -484,7 +484,7 @@ static void test_reduce_middle_dims() {
   }
 }
 
-void test_cxx11_tensor_reduction() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
   CALL_SUBTEST(test_trivial_reductions());
   CALL_SUBTEST(test_trivial_reductions());
   CALL_SUBTEST(test_simple_reductions());
diff --git a/unsupported/test/cxx11_tensor_reduction_gpu.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu
index a36759303..7b8ac2309 100644
--- a/unsupported/test/cxx11_tensor_reduction_gpu.cu
+++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu
@@ -9,7 +9,7 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_gpu
+
 #define EIGEN_USE_GPU
 
 #include "main.h"
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index 440d48bca..f526299c6 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -174,7 +174,7 @@ template void sycl_reduction_test_per_device(const cl::sycl::
   test_first_dim_reductions_max_sycl(sycl_device);
   test_last_dim_reductions_sum_sycl(sycl_device);
 }
-void test_cxx11_tensor_reduction_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_reduction_test_per_device(device));
  }
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
index c8f105e3d..7dbd0478c 100644
--- a/unsupported/test/cxx11_tensor_ref.cpp
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -235,7 +235,7 @@ static void test_nested_ops_with_ref()
 }
 
 
-void test_cxx11_tensor_ref()
+EIGEN_DECLARE_TEST(cxx11_tensor_ref)
 {
   CALL_SUBTEST(test_simple_lvalue_ref());
   CALL_SUBTEST(test_simple_rvalue_ref());
diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp
index b35b8d29e..5e44ec007 100644
--- a/unsupported/test/cxx11_tensor_reverse.cpp
+++ b/unsupported/test/cxx11_tensor_reverse.cpp
@@ -179,7 +179,7 @@ static void test_expr_reverse(bool LValue)
 }
 
 
-void test_cxx11_tensor_reverse()
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse)
 {
   CALL_SUBTEST(test_simple_reverse());
   CALL_SUBTEST(test_simple_reverse());
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
index 2f5484484..77c2235d1 100644
--- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -214,7 +214,7 @@ template void sycl_reverse_test_per_device(const cl::sycl::de
   test_expr_reverse(sycl_device, true);
   test_expr_reverse(sycl_device, true);
 }
-void test_cxx11_tensor_reverse_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_reverse_test_per_device(device));
  }
diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp
index 2c26151ab..83b592384 100644
--- a/unsupported/test/cxx11_tensor_roundings.cpp
+++ b/unsupported/test/cxx11_tensor_roundings.cpp
@@ -54,7 +54,7 @@ static void test_float_ceiling()
   }
 }
 
-void test_cxx11_tensor_roundings()
+EIGEN_DECLARE_TEST(cxx11_tensor_roundings)
 {
   CALL_SUBTEST(test_float_rounding());
   CALL_SUBTEST(test_float_ceiling());
diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp
index af59aa3ef..dccee9e84 100644
--- a/unsupported/test/cxx11_tensor_scan.cpp
+++ b/unsupported/test/cxx11_tensor_scan.cpp
@@ -98,7 +98,7 @@ static void test_tensor_maps() {
   }
 }
 
-void test_cxx11_tensor_scan() {
+EIGEN_DECLARE_TEST(cxx11_tensor_scan) {
   CALL_SUBTEST((test_1d_scan()));
   CALL_SUBTEST((test_1d_scan()));
   CALL_SUBTEST((test_1d_scan()));
diff --git a/unsupported/test/cxx11_tensor_scan_gpu.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu
index 51cd3a3cf..f3e773db5 100644
--- a/unsupported/test/cxx11_tensor_scan_gpu.cu
+++ b/unsupported/test/cxx11_tensor_scan_gpu.cu
@@ -9,7 +9,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_gpu
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
index d11444a14..ab19b6e6b 100644
--- a/unsupported/test/cxx11_tensor_shuffling.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -215,7 +215,7 @@ static void test_shuffle_unshuffle()
 }
 
 
-void test_cxx11_tensor_shuffling()
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling)
 {
   CALL_SUBTEST(test_simple_shuffling());
   CALL_SUBTEST(test_simple_shuffling());
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
index c88db7c72..0e8cc3bd2 100644
--- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -15,7 +15,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -111,7 +111,7 @@ template void sycl_shuffling_test_per_
   test_simple_shuffling_sycl(sycl_device);
 }
 
-void test_cxx11_tensor_shuffling_sycl()
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_shuffling_test_per_device(device));
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
index 5a0d339ef..6d70f5435 100644
--- a/unsupported/test/cxx11_tensor_simple.cpp
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -316,7 +316,7 @@ static void test_resize()
   VERIFY_IS_EQUAL(epsilon.size(), 3*5*7);
 }
 
-void test_cxx11_tensor_simple()
+EIGEN_DECLARE_TEST(cxx11_tensor_simple)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp
index 935b908cc..aefdfa9b4 100644
--- a/unsupported/test/cxx11_tensor_striding.cpp
+++ b/unsupported/test/cxx11_tensor_striding.cpp
@@ -110,7 +110,7 @@ static void test_striding_as_lvalue()
 }
 
 
-void test_cxx11_tensor_striding()
+EIGEN_DECLARE_TEST(cxx11_tensor_striding)
 {
   CALL_SUBTEST(test_simple_striding());
   CALL_SUBTEST(test_simple_striding());
diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp
index 603c3746f..d3b1fa77c 100644
--- a/unsupported/test/cxx11_tensor_striding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp
@@ -13,7 +13,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -196,7 +196,7 @@ template void tensorStridingPerDevice(Dev_selector& s){
   test_striding_as_lvalue(sycl_device);
 }
-void test_cxx11_tensor_striding_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_striding_sycl) {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(tensorStridingPerDevice(device));
  }
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
index 2f56eb495..2ca5c47db 100644
--- a/unsupported/test/cxx11_tensor_sugar.cpp
+++ b/unsupported/test/cxx11_tensor_sugar.cpp
@@ -73,7 +73,7 @@ static void test_scalar_sugar_sub_div() {
   }
 }
 
-void test_cxx11_tensor_sugar()
+EIGEN_DECLARE_TEST(cxx11_tensor_sugar)
 {
   CALL_SUBTEST(test_comparison_sugar());
   CALL_SUBTEST(test_scalar_sugar_add_mul());
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index 5cd0f4c71..9357bed02 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -15,7 +15,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_sycl
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
@@ -269,7 +269,7 @@ template void sycl_computing_test_per_
   test_sycl_cast(sycl_device);
 }
 
-void test_cxx11_tensor_sycl() {
+EIGEN_DECLARE_TEST(cxx11_tensor_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_computing_test_per_device(device));
  }
diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp
index d680e9b3b..fed269a9a 100644
--- a/unsupported/test/cxx11_tensor_symmetry.cpp
+++ b/unsupported/test/cxx11_tensor_symmetry.cpp
@@ -801,7 +801,7 @@ static void test_tensor_randacc()
   }
 }
 
-void test_cxx11_tensor_symmetry()
+EIGEN_DECLARE_TEST(cxx11_tensor_symmetry)
 {
   CALL_SUBTEST(test_symgroups_static());
   CALL_SUBTEST(test_symgroups_dynamic());
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index ea9d8afdc..20a197f2b 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -399,7 +399,7 @@ void test_multithread_shuffle()
 }
 
 
-void test_cxx11_tensor_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
 {
   CALL_SUBTEST_1(test_multithread_elementwise());
   CALL_SUBTEST_1(test_multithread_compound_assignment());
diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp
index 340d1211c..1579bc1eb 100644
--- a/unsupported/test/cxx11_tensor_trace.cpp
+++ b/unsupported/test/cxx11_tensor_trace.cpp
@@ -159,7 +159,7 @@ static void test_trace_in_expr() {
 }
 
 
-void test_cxx11_tensor_trace() {
+EIGEN_DECLARE_TEST(cxx11_tensor_trace) {
   CALL_SUBTEST(test_0D_trace());
   CALL_SUBTEST(test_0D_trace());
   CALL_SUBTEST(test_all_dimensions_trace());
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
index d2a1e8673..07691df98 100644
--- a/unsupported/test/cxx11_tensor_uint128.cpp
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -144,7 +144,7 @@ void test_misc2() {
 #endif
 
 
-void test_cxx11_tensor_uint128()
+EIGEN_DECLARE_TEST(cxx11_tensor_uint128)
 {
 #ifdef EIGEN_NO_INT128
   // Skip the test on compilers that don't support 128bit integers natively
diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
index ca6840f3b..3aa363eb5 100644
--- a/unsupported/test/cxx11_tensor_volume_patch.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -105,7 +105,7 @@ static void
test_entire_volume_patch() } } -void test_cxx11_tensor_volume_patch() +EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch) { CALL_SUBTEST(test_single_voxel_patch()); CALL_SUBTEST(test_entire_volume_patch()); diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp index 039715abc..ca7994fd9 100644 --- a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp +++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp @@ -13,7 +13,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_volume_patch_sycl + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -214,7 +214,7 @@ std::cout << "Running on " << s.template get_info( test_single_voxel_patch_sycl(sycl_device); test_entire_volume_patch_sycl(sycl_device); } -void test_cxx11_tensor_volume_patch_sycl() +EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl) { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device(device)); diff --git a/unsupported/test/dgmres.cpp b/unsupported/test/dgmres.cpp index 2b11807c8..04f5ad670 100644 --- a/unsupported/test/dgmres.cpp +++ b/unsupported/test/dgmres.cpp @@ -24,7 +24,7 @@ template void test_dgmres_T() //CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor) ); } -void test_dgmres() +EIGEN_DECLARE_TEST(dgmres) { CALL_SUBTEST_1(test_dgmres_T()); CALL_SUBTEST_2(test_dgmres_T >()); diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp index 6d0ae738d..688594eff 100644 --- a/unsupported/test/forward_adolc.cpp +++ b/unsupported/test/forward_adolc.cpp @@ -119,7 +119,7 @@ template void adolc_forward_jacobian(const Func& f) VERIFY_IS_APPROX(j, jref); } -void test_forward_adolc() +EIGEN_DECLARE_TEST(forward_adolc) { adtl::setNumDir(NUMBER_DIRECTIONS); diff --git a/unsupported/test/gmres.cpp b/unsupported/test/gmres.cpp index f2969116b..8d2254b5b 100644 --- a/unsupported/test/gmres.cpp +++ b/unsupported/test/gmres.cpp @@ -24,7 +24,7 @@ template void test_gmres_T() //CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor) ); } -void test_gmres() +EIGEN_DECLARE_TEST(gmres) { CALL_SUBTEST_1(test_gmres_T()); CALL_SUBTEST_2(test_gmres_T >()); diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp index e770049e5..4f143b6de 100644 --- a/unsupported/test/kronecker_product.cpp +++ b/unsupported/test/kronecker_product.cpp @@ -83,7 +83,7 @@ void check_sparse_kronecker_product(const MatrixType& ab) } -void test_kronecker_product() +EIGEN_DECLARE_TEST(kronecker_product) { // DM = dense matrix; SM = sparse matrix @@ -240,7 +240,7 @@ void test_kronecker_product() #include "main.h" #include -void test_kronecker_product() +EIGEN_DECLARE_TEST(kronecker_product) { MatrixXd a(2,2), b(3,3), c; a.setRandom(); diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp index 64f168c16..7f9a81cd3 100644 --- a/unsupported/test/levenberg_marquardt.cpp +++ b/unsupported/test/levenberg_marquardt.cpp @@ -1445,7 +1445,7 @@ void testNistEckerle4(void) VERIFY_IS_APPROX(x[2], 4.5154121844E+02); } -void test_levenberg_marquardt() +EIGEN_DECLARE_TEST(levenberg_marquardt) { // Tests using the examples provided by (c)minpack CALL_SUBTEST(testLmder1()); diff --git a/unsupported/test/matrix_exponential.cpp b/unsupported/test/matrix_exponential.cpp index 50dec083d..b032cbf1d 100644 --- a/unsupported/test/matrix_exponential.cpp +++ 
b/unsupported/test/matrix_exponential.cpp @@ -119,7 +119,7 @@ void randomTest(const MatrixType& m, double tol) } } -void test_matrix_exponential() +EIGEN_DECLARE_TEST(matrix_exponential) { CALL_SUBTEST_2(test2dRotation(1e-13)); CALL_SUBTEST_1(test2dRotation(2e-5)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64 diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 7c9b68a3c..93fb71430 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -181,7 +181,7 @@ void testMatrixType(const MatrixType& m) } } -void test_matrix_function() +EIGEN_DECLARE_TEST(matrix_function) { CALL_SUBTEST_1(testMatrixType(Matrix())); CALL_SUBTEST_2(testMatrixType(Matrix3cf())); diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index 7ccfacfdf..fa52d256e 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -150,7 +150,7 @@ typedef Matrix Matrix3dRowMajor; typedef Matrix Matrix3e; typedef Matrix MatrixXe; -void test_matrix_power() +EIGEN_DECLARE_TEST(matrix_power) { CALL_SUBTEST_2(test2dRotation(1e-13)); CALL_SUBTEST_1(test2dRotation(2e-5)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64 diff --git a/unsupported/test/matrix_square_root.cpp b/unsupported/test/matrix_square_root.cpp index ea541e1ea..034f29217 100644 --- a/unsupported/test/matrix_square_root.cpp +++ b/unsupported/test/matrix_square_root.cpp @@ -18,7 +18,7 @@ void testMatrixSqrt(const MatrixType& m) VERIFY_IS_APPROX(sqrtA * sqrtA, A); } -void test_matrix_square_root() +EIGEN_DECLARE_TEST(matrix_square_root) { for (int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf())); diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index 8b300b78a..2eb40fef6 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -36,7 +36,7 @@ template void test_minres_T() } -void test_minres() +EIGEN_DECLARE_TEST(minres) { CALL_SUBTEST_1(test_minres_T()); // CALL_SUBTEST_2(test_minres_T >()); diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp index 685e7ea45..4a25e993c 100644 --- a/unsupported/test/mpreal_support.cpp +++ b/unsupported/test/mpreal_support.cpp @@ -7,7 +7,7 @@ using namespace mpfr; using namespace Eigen; -void test_mpreal_support() +EIGEN_DECLARE_TEST(mpreal_support) { // set precision to 256 bits (double has only 53 bits) mpreal::set_default_prec(256); diff --git a/unsupported/test/openglsupport.cpp b/unsupported/test/openglsupport.cpp index 706a816f7..460830086 100644 --- a/unsupported/test/openglsupport.cpp +++ b/unsupported/test/openglsupport.cpp @@ -106,7 +106,7 @@ GLint createShader(const char* vtx, const char* frg) return prg_id; } -void test_openglsupport() +EIGEN_DECLARE_TEST(openglsupport) { int argc = 0; glutInit(&argc, 0); diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp index 7ad4aa69d..65efea0cb 100644 --- a/unsupported/test/polynomialsolver.cpp +++ b/unsupported/test/polynomialsolver.cpp @@ -191,7 +191,7 @@ void polynomialsolver(int deg) realRoots ); } -void test_polynomialsolver() +EIGEN_DECLARE_TEST(polynomialsolver) { for(int i = 0; i < g_repeat; i++) { diff --git a/unsupported/test/polynomialutils.cpp b/unsupported/test/polynomialutils.cpp index 5fc968402..8ff451996 100644 --- a/unsupported/test/polynomialutils.cpp +++ b/unsupported/test/polynomialutils.cpp @@ -101,7 +101,7 @@ template void CauchyBounds_scalar() internal::random(18,26) )) ); } -void 
test_polynomialutils() +EIGEN_DECLARE_TEST(polynomialutils) { for(int i = 0; i < g_repeat; i++) { diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp index 7cf4a77c3..4ac53a9a7 100644 --- a/unsupported/test/sparse_extra.cpp +++ b/unsupported/test/sparse_extra.cpp @@ -142,7 +142,7 @@ void check_marketio() VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2)); } -void test_sparse_extra() +EIGEN_DECLARE_TEST(sparse_extra) { for(int i = 0; i < g_repeat; i++) { int s = Eigen::internal::random(1,50); diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp index 802e16150..50dae6c93 100644 --- a/unsupported/test/special_functions.cpp +++ b/unsupported/test/special_functions.cpp @@ -472,7 +472,7 @@ template void array_special_functions() #endif // EIGEN_HAS_C99_MATH } -void test_special_functions() +EIGEN_DECLARE_TEST(special_functions) { CALL_SUBTEST_1(array_special_functions()); CALL_SUBTEST_2(array_special_functions()); diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp index 3be020434..88ec87b97 100644 --- a/unsupported/test/splines.cpp +++ b/unsupported/test/splines.cpp @@ -268,7 +268,7 @@ void check_global_interpolation_with_derivatives2d() } } -void test_splines() +EIGEN_DECLARE_TEST(splines) { for (int i = 0; i < g_repeat; ++i) { From dff3a92d527fd38c28152ab9259af1904a01f248 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 17 Jul 2018 15:52:58 +0200 Subject: [PATCH 256/680] Remove usage of #if EIGEN_TEST_PART_XX in unit tests that do not require them (splitting can thus be avoided for them) --- test/adjoint.cpp | 56 +++++++++++----------- test/array_reverse.cpp | 16 ++++--- test/basicstuff.cpp | 5 +- test/eigensolver_generic.cpp | 54 +++++++++++---------- test/incomplete_cholesky.cpp | 20 ++++---- test/integer_types.cpp | 20 ++++---- test/inverse.cpp | 91 ++++++++++++++++++++---------- test/linearstructure.cpp | 29 ++++++------ test/nullary.cpp | 76 ++++++++++++++++-------------- test/product_large.cpp | 34 ++++++++------ test/product_small.cpp | 58 ++++++++++++----------- test/sparse_basic.cpp | 29 ++++++------ 12 files changed, 267 insertions(+), 221 deletions(-) diff --git a/test/adjoint.cpp b/test/adjoint.cpp index ca44466b2..4e1e4b5e8 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -145,6 +145,34 @@ template void adjoint(const MatrixType& m) VERIFY_IS_APPROX(rv1.template cast().dot(v1), rv1.dot(v1)); } +template +void adjoint_extra() +{ + MatrixXcf a(10,10), b(10,10); + VERIFY_RAISES_ASSERT(a = a.transpose()); + VERIFY_RAISES_ASSERT(a = a.transpose() + b); + VERIFY_RAISES_ASSERT(a = b + a.transpose()); + VERIFY_RAISES_ASSERT(a = a.conjugate().transpose()); + VERIFY_RAISES_ASSERT(a = a.adjoint()); + VERIFY_RAISES_ASSERT(a = a.adjoint() + b); + VERIFY_RAISES_ASSERT(a = b + a.adjoint()); + + // no assertion should be triggered for these cases: + a.transpose() = a.transpose(); + a.transpose() += a.transpose(); + a.transpose() += a.transpose() + b; + a.transpose() = a.adjoint(); + a.transpose() += a.adjoint(); + a.transpose() += a.adjoint() + b; + + // regression tests for check_for_aliasing + MatrixXd c(10,10); + c = 1.0 * MatrixXd::Ones(10,10) + c; + c = MatrixXd::Ones(10,10) * 1.0 + c; + c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) ); + c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10); +} + EIGEN_DECLARE_TEST(adjoint) { for(int i = 0; i < g_repeat; i++) { @@ -168,32 +196,6 @@ EIGEN_DECLARE_TEST(adjoint) // test a large static matrix only once
CALL_SUBTEST_7( adjoint(Matrix()) ); -#ifdef EIGEN_TEST_PART_13 - { - MatrixXcf a(10,10), b(10,10); - VERIFY_RAISES_ASSERT(a = a.transpose()); - VERIFY_RAISES_ASSERT(a = a.transpose() + b); - VERIFY_RAISES_ASSERT(a = b + a.transpose()); - VERIFY_RAISES_ASSERT(a = a.conjugate().transpose()); - VERIFY_RAISES_ASSERT(a = a.adjoint()); - VERIFY_RAISES_ASSERT(a = a.adjoint() + b); - VERIFY_RAISES_ASSERT(a = b + a.adjoint()); - - // no assertion should be triggered for these cases: - a.transpose() = a.transpose(); - a.transpose() += a.transpose(); - a.transpose() += a.transpose() + b; - a.transpose() = a.adjoint(); - a.transpose() += a.adjoint(); - a.transpose() += a.adjoint() + b; - - // regression tests for check_for_aliasing - MatrixXd c(10,10); - c = 1.0 * MatrixXd::Ones(10,10) + c; - c = MatrixXd::Ones(10,10) * 1.0 + c; - c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) ); - c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10); - } -#endif + CALL_SUBTEST_13( adjoint_extra<0>() ); } diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp index 317b9a6e2..e23159def 100644 --- a/test/array_reverse.cpp +++ b/test/array_reverse.cpp @@ -123,6 +123,15 @@ template void reverse(const MatrixType& m) VERIFY_IS_APPROX(x, m1(r, cols - 1 - c)); } +template +void array_reverse_extra() +{ + Vector4f x; x << 1, 2, 3, 4; + Vector4f y; y << 4, 3, 2, 1; + VERIFY(x.reverse()[1] == 3); + VERIFY(x.reverse() == y); +} + EIGEN_DECLARE_TEST(array_reverse) { for(int i = 0; i < g_repeat; i++) { @@ -136,10 +145,5 @@ EIGEN_DECLARE_TEST(array_reverse) CALL_SUBTEST_8( reverse(Matrix()) ); CALL_SUBTEST_9( reverse(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } -#ifdef EIGEN_TEST_PART_3 - Vector4f x; x << 1, 2, 3, 4; - Vector4f y; y << 4, 3, 2, 1; - VERIFY(x.reverse()[1] == 3); - VERIFY(x.reverse() == y); -#endif + CALL_SUBTEST_3( array_reverse_extra<0>() ); } diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp index 5ad4090cd..85af603d8 100644 --- a/test/basicstuff.cpp +++ b/test/basicstuff.cpp @@ -194,7 +194,7 @@ template void basicStuffComplex(const MatrixType& m) VERIFY(!static_cast(cm).imag().isZero()); } -#ifdef EIGEN_TEST_PART_2 +template void casting() { Matrix4f m = Matrix4f::Random(), m2; @@ -203,7 +203,6 @@ void casting() m2 = m.cast(); // check the specialization when NewType == Type VERIFY(m.isApprox(m2)); } -#endif template void fixedSizeMatrixConstruction() @@ -290,5 +289,5 @@ EIGEN_DECLARE_TEST(basicstuff) CALL_SUBTEST_1(fixedSizeMatrixConstruction()); CALL_SUBTEST_1(fixedSizeMatrixConstruction()); - CALL_SUBTEST_2(casting()); + CALL_SUBTEST_2(casting<0>()); } diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index b41186819..e0e435151 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -100,6 +100,34 @@ template void eigensolver_verify_assert(const MatrixType& m VERIFY_RAISES_ASSERT(eig.pseudoEigenvectors()); } +template +void eigensolver_generic_extra() +{ + { + // regression test for bug 793 + MatrixXd a(3,3); + a << 0, 0, 1, + 1, 1, 1, + 1, 1e+200, 1; + Eigen::EigenSolver eig(a); + double scale = 1e-200; // scale to avoid overflow during the comparisons + VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale); + VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale); + } + { + // check a case where all eigenvalues are null. 
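The refactoring pattern used throughout this patch moves code that was guarded by #ifdef EIGEN_TEST_PART_N into a function with a dummy template parameter, dispatched through CALL_SUBTEST_N instead of the preprocessor; the dummy parameter keeps the body uninstantiated whenever the corresponding subtest is compiled out, so the file no longer has to be built once per test part. A minimal sketch of the pattern (foo and foo_extra are illustrative names, not from the patch):

    template<int>
    void foo_extra()
    {
      // checks formerly guarded by #ifdef EIGEN_TEST_PART_2
    }

    EIGEN_DECLARE_TEST(foo)
    {
      CALL_SUBTEST_2( foo_extra<0>() );
    }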
+ MatrixXd a(2,2); + a << 1, 1, + -1, -1; + Eigen::EigenSolver eig(a); + VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.); + VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.); + VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.); + VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.); + VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.); + } +} + EIGEN_DECLARE_TEST(eigensolver_generic) { int s = 0; @@ -135,31 +163,7 @@ EIGEN_DECLARE_TEST(eigensolver_generic) } ); -#ifdef EIGEN_TEST_PART_2 - { - // regression test for bug 793 - MatrixXd a(3,3); - a << 0, 0, 1, - 1, 1, 1, - 1, 1e+200, 1; - Eigen::EigenSolver eig(a); - double scale = 1e-200; // scale to avoid overflow during the comparisons - VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale); - VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale); - } - { - // check a case where all eigenvalues are null. - MatrixXd a(2,2); - a << 1, 1, - -1, -1; - Eigen::EigenSolver eig(a); - VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.); - VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.); - VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.); - VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.); - VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.); - } -#endif + CALL_SUBTEST_2( eigensolver_generic_extra<0>() ); TEST_SET_BUT_UNUSED_VARIABLE(s) } diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 52235c2c2..68fe7d507 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -29,14 +29,10 @@ template void test_incomplete_cholesky_T() CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) ); } -EIGEN_DECLARE_TEST(incomplete_cholesky) +template +void bug1150() { - CALL_SUBTEST_1(( test_incomplete_cholesky_T() )); - CALL_SUBTEST_2(( test_incomplete_cholesky_T, int>() )); - CALL_SUBTEST_3(( test_incomplete_cholesky_T() )); - -#ifdef EIGEN_TEST_PART_1 - // regression for bug 1150 + // regression for bug 1150 for(int N = 1; N<20; ++N) { Eigen::MatrixXd b( N, N ); @@ -61,5 +57,13 @@ EIGEN_DECLARE_TEST(incomplete_cholesky) VERIFY(solver.preconditioner().info() == Eigen::Success); VERIFY(solver.info() == Eigen::Success); } -#endif +} + +EIGEN_DECLARE_TEST(incomplete_cholesky) +{ + CALL_SUBTEST_1(( test_incomplete_cholesky_T() )); + CALL_SUBTEST_2(( test_incomplete_cholesky_T, int>() )); + CALL_SUBTEST_3(( test_incomplete_cholesky_T() )); + + CALL_SUBTEST_1(( bug1150<0>() )); } diff --git a/test/integer_types.cpp b/test/integer_types.cpp index 88573c568..e9e514f12 100644 --- a/test/integer_types.cpp +++ b/test/integer_types.cpp @@ -131,6 +131,17 @@ template void integer_type_tests(const MatrixType& m) VERIFY_IS_APPROX((m1 * m2.transpose()) * m1, m1 * (m2.transpose() * m1)); } +template +void integer_types_extra() +{ + VERIFY_IS_EQUAL(internal::scalar_div_cost::value, 8); + VERIFY_IS_EQUAL(internal::scalar_div_cost::value, 8); + if(sizeof(long)>sizeof(int)) { + VERIFY(int(internal::scalar_div_cost::value) > int(internal::scalar_div_cost::value)); + VERIFY(int(internal::scalar_div_cost::value) > int(internal::scalar_div_cost::value)); + } +} + EIGEN_DECLARE_TEST(integer_types) { for(int i = 0; i < g_repeat; i++) { @@ -156,12 +167,5 @@ EIGEN_DECLARE_TEST(integer_types) CALL_SUBTEST_8( 
integer_type_tests(Matrix(1, 5)) ); } -#ifdef EIGEN_TEST_PART_9 - VERIFY_IS_EQUAL(internal::scalar_div_cost::value, 8); - VERIFY_IS_EQUAL(internal::scalar_div_cost::value, 8); - if(sizeof(long)>sizeof(int)) { - VERIFY(int(internal::scalar_div_cost::value) > int(internal::scalar_div_cost::value)); - VERIFY(int(internal::scalar_div_cost::value) > int(internal::scalar_div_cost::value)); - } -#endif + CALL_SUBTEST_9( integer_types_extra<0>() ); } diff --git a/test/inverse.cpp b/test/inverse.cpp index b989c7905..8754cb7e5 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -11,9 +11,59 @@ #include "main.h" #include -template void inverse(const MatrixType& m) +template +void inverse_for_fixed_size(const MatrixType&, typename internal::enable_if::type* = 0) +{ +} + +template +void inverse_for_fixed_size(const MatrixType& m1, typename internal::enable_if::type* = 0) { using std::abs; + + MatrixType m2, identity = MatrixType::Identity(); + + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef Matrix VectorType; + + //computeInverseAndDetWithCheck tests + //First: an invertible matrix + bool invertible; + Scalar det; + + m2.setZero(); + m1.computeInverseAndDetWithCheck(m2, det, invertible); + VERIFY(invertible); + VERIFY_IS_APPROX(identity, m1*m2); + VERIFY_IS_APPROX(det, m1.determinant()); + + m2.setZero(); + m1.computeInverseWithCheck(m2, invertible); + VERIFY(invertible); + VERIFY_IS_APPROX(identity, m1*m2); + + //Second: a rank one matrix (not invertible, except for 1x1 matrices) + VectorType v3 = VectorType::Random(); + MatrixType m3 = v3*v3.transpose(), m4; + m3.computeInverseAndDetWithCheck(m4, det, invertible); + VERIFY( m1.rows()==1 ? invertible : !invertible ); + VERIFY_IS_MUCH_SMALLER_THAN(abs(det-m3.determinant()), RealScalar(1)); + m3.computeInverseWithCheck(m4, invertible); + VERIFY( m1.rows()==1 ? invertible : !invertible ); + + // check with submatrices + { + Matrix m5; + m5.setRandom(); + m5.topLeftCorner(m1.rows(),m1.rows()) = m1; + m2 = m5.template topLeftCorner().inverse(); + VERIFY_IS_APPROX( (m5.template topLeftCorner()), m2.inverse() ); + } +} + +template void inverse(const MatrixType& m) +{ /* this test covers the following files: Inverse.h */ @@ -39,44 +89,7 @@ template void inverse(const MatrixType& m) // since for the general case we implement separately row-major and col-major, test that VERIFY_IS_APPROX(MatrixType(m1.transpose().inverse()), MatrixType(m1.inverse().transpose())); -#if !defined(EIGEN_TEST_PART_5) && !defined(EIGEN_TEST_PART_6) - typedef typename NumTraits::Real RealScalar; - typedef Matrix VectorType; - - //computeInverseAndDetWithCheck tests - //First: an invertible matrix - bool invertible; - Scalar det; - - m2.setZero(); - m1.computeInverseAndDetWithCheck(m2, det, invertible); - VERIFY(invertible); - VERIFY_IS_APPROX(identity, m1*m2); - VERIFY_IS_APPROX(det, m1.determinant()); - - m2.setZero(); - m1.computeInverseWithCheck(m2, invertible); - VERIFY(invertible); - VERIFY_IS_APPROX(identity, m1*m2); - - //Second: a rank one matrix (not invertible, except for 1x1 matrices) - VectorType v3 = VectorType::Random(rows); - MatrixType m3 = v3*v3.transpose(), m4(rows,cols); - m3.computeInverseAndDetWithCheck(m4, det, invertible); - VERIFY( rows==1 ? invertible : !invertible ); - VERIFY_IS_MUCH_SMALLER_THAN(abs(det-m3.determinant()), RealScalar(1)); - m3.computeInverseWithCheck(m4, invertible); - VERIFY( rows==1 ? 
invertible : !invertible ); - - // check with submatrices - { - Matrix m5; - m5.setRandom(); - m5.topLeftCorner(rows,rows) = m1; - m2 = m5.template topLeftCorner().inverse(); - VERIFY_IS_APPROX( (m5.template topLeftCorner()), m2.inverse() ); - } -#endif + inverse_for_fixed_size(m1); // check in-place inversion if(MatrixType::RowsAtCompileTime>=2 && MatrixType::RowsAtCompileTime<=4) diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp index 4137b0251..46ee5162b 100644 --- a/test/linearstructure.cpp +++ b/test/linearstructure.cpp @@ -110,6 +110,19 @@ template void real_complex(DenseIndex rows = MatrixType::Ro VERIFY(g_called && "matrix - real not properly optimized"); } +template +void linearstructure_overflow() +{ + // make sure that /=scalar and /scalar do not overflow + // rational: 1.0/4.94e-320 overflow, but m/4.94e-320 should not + Matrix4d m2, m3; + m3 = m2 = Matrix4d::Random()*1e-20; + m2 = m2 / 4.9e-320; + VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones()); + m3 /= 4.9e-320; + VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones()); +} + EIGEN_DECLARE_TEST(linearstructure) { g_called = true; @@ -130,19 +143,5 @@ EIGEN_DECLARE_TEST(linearstructure) CALL_SUBTEST_11( real_complex(10,10) ); CALL_SUBTEST_11( real_complex(10,10) ); } - -#ifdef EIGEN_TEST_PART_4 - { - // make sure that /=scalar and /scalar do not overflow - // rational: 1.0/4.94e-320 overflow, but m/4.94e-320 should not - Matrix4d m2, m3; - m3 = m2 = Matrix4d::Random()*1e-20; - m2 = m2 / 4.9e-320; - VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones()); - m3 /= 4.9e-320; - VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones()); - - - } -#endif + CALL_SUBTEST_4( linearstructure_overflow<0>() ); } diff --git a/test/nullary.cpp b/test/nullary.cpp index 9e6e6eaa0..12b9e122f 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -239,45 +239,28 @@ void testMatrixType(const MatrixType& m) VERIFY_IS_APPROX( A(i,j), s1 ); } -EIGEN_DECLARE_TEST(nullary) +template +void bug79() { - CALL_SUBTEST_1( testMatrixType(Matrix2d()) ); - CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random(1,300),internal::random(1,300))) ); - CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random(1,300),internal::random(1,300))) ); - - for(int i = 0; i < g_repeat*10; i++) { - CALL_SUBTEST_4( testVectorType(VectorXd(internal::random(1,30000))) ); - CALL_SUBTEST_5( testVectorType(Vector4d()) ); // regression test for bug 232 - CALL_SUBTEST_6( testVectorType(Vector3d()) ); - CALL_SUBTEST_7( testVectorType(VectorXf(internal::random(1,30000))) ); - CALL_SUBTEST_8( testVectorType(Vector3f()) ); - CALL_SUBTEST_8( testVectorType(Vector4f()) ); - CALL_SUBTEST_8( testVectorType(Matrix()) ); - CALL_SUBTEST_8( testVectorType(Matrix()) ); - - CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(1,10))) ); - CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(9,300))) ); - CALL_SUBTEST_9( testVectorType(Matrix()) ); - } - -#ifdef EIGEN_TEST_PART_6 // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79). 
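inverse.cpp above uses a different device than the dummy-template pattern: the checks that only make sense for fixed-size matrices are split into a pair of SFINAE overloads of inverse_for_fixed_size, an empty one selected for dynamic sizes and a real one otherwise. A minimal sketch of the dispatch; the exact enable_if condition shown here, keyed on the compile-time size, is an assumption:

    template<typename MatrixType>
    void inverse_for_fixed_size(const MatrixType&,
        typename internal::enable_if<MatrixType::SizeAtCompileTime==Dynamic>::type* = 0)
    {} // dynamic size: nothing extra to check

    template<typename MatrixType>
    void inverse_for_fixed_size(const MatrixType& m1,
        typename internal::enable_if<MatrixType::SizeAtCompileTime!=Dynamic>::type* = 0)
    {
      // fixed-size-only checks on m1 (computeInverseAndDetWithCheck, etc.)
    }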
VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits::epsilon() ); -#endif +} -#ifdef EIGEN_TEST_PART_9 +template +void nullary_overflow() +{ // Check possible overflow issue - { - int n = 60000; - ArrayXi a1(n), a2(n); - a1.setLinSpaced(n, 0, n-1); - for(int i=0; i +void nullary_internal_logic() +{ // check some internal logic VERIFY(( internal::has_nullary_operator >::value )); VERIFY(( !internal::has_unary_operator >::value )); @@ -318,5 +301,30 @@ EIGEN_DECLARE_TEST(nullary) VERIFY(( !internal::has_binary_operator >::value )); VERIFY(( internal::functor_has_linear_access >::ret )); } -#endif +} + +EIGEN_DECLARE_TEST(nullary) +{ + CALL_SUBTEST_1( testMatrixType(Matrix2d()) ); + CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random(1,300),internal::random(1,300))) ); + CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random(1,300),internal::random(1,300))) ); + + for(int i = 0; i < g_repeat*10; i++) { + CALL_SUBTEST_4( testVectorType(VectorXd(internal::random(1,30000))) ); + CALL_SUBTEST_5( testVectorType(Vector4d()) ); // regression test for bug 232 + CALL_SUBTEST_6( testVectorType(Vector3d()) ); + CALL_SUBTEST_7( testVectorType(VectorXf(internal::random(1,30000))) ); + CALL_SUBTEST_8( testVectorType(Vector3f()) ); + CALL_SUBTEST_8( testVectorType(Vector4f()) ); + CALL_SUBTEST_8( testVectorType(Matrix()) ); + CALL_SUBTEST_8( testVectorType(Matrix()) ); + + CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(1,10))) ); + CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(9,300))) ); + CALL_SUBTEST_9( testVectorType(Matrix()) ); + } + + CALL_SUBTEST_6( bug79<0>() ); + CALL_SUBTEST_9( nullary_overflow<0>() ); + CALL_SUBTEST_10( nullary_internal_logic<0>() ); } diff --git a/test/product_large.cpp b/test/product_large.cpp index 1b6fec738..ae14b714c 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -30,21 +30,9 @@ void test_aliasing() x = z; } -EIGEN_DECLARE_TEST(product_large) +template +void product_large_regressions() { - for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_1( product(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); - CALL_SUBTEST_2( product(MatrixXd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); - CALL_SUBTEST_2( product(MatrixXd(internal::random(1,10), internal::random(1,10))) ); - - CALL_SUBTEST_3( product(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); - CALL_SUBTEST_4( product(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); - CALL_SUBTEST_5( product(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); - - CALL_SUBTEST_1( test_aliasing() ); - } - -#if defined EIGEN_TEST_PART_6 { // test a specific issue in DiagonalProduct int N = 1000000; @@ -97,7 +85,23 @@ EIGEN_DECLARE_TEST(product_large) * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))); VERIFY_IS_APPROX(B,C); } -#endif +} + +EIGEN_DECLARE_TEST(product_large) +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( product(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_2( product(MatrixXd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_2( product(MatrixXd(internal::random(1,10), internal::random(1,10))) 
); + + CALL_SUBTEST_3( product(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_4( product(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); + CALL_SUBTEST_5( product(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + + CALL_SUBTEST_1( test_aliasing() ); + } + + CALL_SUBTEST_6( product_large_regressions<0>() ); // Regression test for bug 714: #if defined EIGEN_HAS_OPENMP diff --git a/test/product_small.cpp b/test/product_small.cpp index 16138631a..b8ce37d90 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -228,6 +228,36 @@ void bug_1311() VERIFY_IS_APPROX(res, A*b); } +template +void product_small_regressions() +{ + { + // test compilation of (outer_product) * vector + Vector3f v = Vector3f::Random(); + VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v); + } + + { + // regression test for pull-request #93 + Eigen::Matrix A; A.setRandom(); + Eigen::Matrix B; B.setRandom(); + Eigen::Matrix C; C.setRandom(); + VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]); + VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C); + } + + { + Eigen::Matrix A, B, C; + A.setRandom(); + C = A; + for(int k=0; k<79; ++k) + C = C * A; + B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))) + * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))); + VERIFY_IS_APPROX(B,C); + } +} + EIGEN_DECLARE_TEST(product_small) { for(int i = 0; i < g_repeat; i++) { @@ -263,31 +293,5 @@ EIGEN_DECLARE_TEST(product_small) CALL_SUBTEST_6( bug_1311<5>() ); } -#ifdef EIGEN_TEST_PART_6 - { - // test compilation of (outer_product) * vector - Vector3f v = Vector3f::Random(); - VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v); - } - - { - // regression test for pull-request #93 - Eigen::Matrix A; A.setRandom(); - Eigen::Matrix B; B.setRandom(); - Eigen::Matrix C; C.setRandom(); - VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]); - VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C); - } - - { - Eigen::Matrix A, B, C; - A.setRandom(); - C = A; - for(int k=0; k<79; ++k) - C = C * A; - B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))) - * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))); - VERIFY_IS_APPROX(B,C); - } -#endif + CALL_SUBTEST_6( product_small_regressions<0>() ); } diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index bfd61f69f..3691e8dad 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -640,6 +640,20 @@ void big_sparse_triplet(Index rows, Index cols, double density) { VERIFY_IS_APPROX(sum, m.sum()); } +template +void bug1105() +{ + // Regression test for bug 1105 + int n = Eigen::internal::random(200,600); + SparseMatrix,0, long> mat(n, n); + std::complex val; + + for(int i=0; i >(10000, 10000, 0.125))); CALL_SUBTEST_4((big_sparse_triplet >(10000, 10000, 0.125))); - // Regression test for bug 1105 -#ifdef EIGEN_TEST_PART_7 - { - int n = Eigen::internal::random(200,600); - SparseMatrix,0, long> mat(n, n); - std::complex val; - - for(int i=0; i() ); } From 
f124f0796533081cb7b061c4aa8667df6ed58863 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Tue, 17 Jul 2018 14:16:48 -0400 Subject: [PATCH 257/680] applying EIGEN_DECLARE_TEST to *gpu* tests Also, a few minor fixes for GPU tests running in HIP mode. 1. Adding an include for hip/hip_runtime.h in the Macros.h file For HIP __host__ and __device__ are macros which are defined in hip headers. Their definitions need to be included before their use in the file. 2. Fixing the compile failure in TensorContractionGpu introduced by the commit to "Fuse computations into the Tensor contractions using output kernel" 3. Fixing a HIP/clang specific compile error by making the struct-member assignment explicit --- Eigen/src/Core/util/Macros.h | 6 ++++-- Eigen/src/Core/util/Memory.h | 2 +- Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 7 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 3 +-- .../Eigen/CXX11/src/Tensor/TensorContractionGpu.h | 10 +++++----- unsupported/test/cxx11_tensor_argmax_gpu.cu | 2 +- unsupported/test/cxx11_tensor_cast_float16_gpu.cu | 2 +- unsupported/test/cxx11_tensor_contract_gpu.cu | 2 +- unsupported/test/cxx11_tensor_device.cu | 2 +- unsupported/test/cxx11_tensor_gpu.cu | 2 +- unsupported/test/cxx11_tensor_of_float16_gpu.cu | 2 +- unsupported/test/cxx11_tensor_random_gpu.cu | 2 +- unsupported/test/cxx11_tensor_reduction_gpu.cu | 2 +- unsupported/test/cxx11_tensor_scan_gpu.cu | 2 +- 14 files changed, 24 insertions(+), 22 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index fc4c0815c..adf25ee9b 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -395,8 +395,10 @@ // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP) #define EIGEN_HIPCC __HIPCC__ - // We need hip_common.h here because __HIP_DEVICE_COMPILE__ is defined in this header. - #include + // We need to include hip_runtime.h here because it pulls in + // ++ hip_common.h which contains the define for __HIP_DEVICE_COMPILE__ + // ++ host_defines.h which contains the defines for the __host__ and __device__ macros + #include #if defined(__HIP_DEVICE_COMPILE__) // analogous to EIGEN_CUDA_ARCH, but for HIP diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 956f64d1d..8dbd4d93b 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -580,7 +580,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#if ! defined EIGEN_ALLOCA && ! defined EIGEN_CUDA_ARCH +#if ! defined EIGEN_ALLOCA && ! 
defined EIGEN_GPU_COMPILE_PHASE #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index fbc1ee2f6..2d1a82c57 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -610,6 +610,7 @@ template struct direct_selfadjoint_eigenvalues res, Ref representative) { EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD_MATH(sqrt); Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -620,8 +621,8 @@ template struct direct_selfadjoint_eigenvaluesn1) res = c0/std::sqrt(n0); - else res = c1/std::sqrt(n1); + if(n0>n1) res = c0/sqrt(n0); + else res = c1/sqrt(n1); return true; } @@ -723,7 +724,7 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void computeRoots(const MatrixType& m, VectorType& roots) { - using std::sqrt; + EIGEN_USING_STD_MATH(sqrt); const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0))); const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1)); roots(0) = t1 - t0; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 0fbffa34c..c7c443a59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -449,8 +449,7 @@ struct TensorContractionEvaluatorBase // tensor dimensions (i, j) into the original tensor dimensions. // TODO(ezhulenev): Add parameters required to infer output tensor index for // more complex contractions than 2x2 on internal dimension. 
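The change just below is item 3 of the commit message: brace-assignment of the contraction params struct is replaced by an explicit member assignment, reported to work around a HIP/clang specific compile error. The two forms are otherwise equivalent; a minimal sketch, where the struct definition is assumed from the member name (only the bool field is visible in the diff):

    struct TensorContractionParams { bool swapped_arguments; };

    TensorContractionParams p;
    p = { /*swapped_arguments=*/true };  // list-assignment form removed by the patch
    p.swapped_arguments = true;          // explicit member assignment kept instead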
- m_tensor_contraction_params = { - /**swapped_arguments=*/static_cast(Layout) == RowMajor}; + m_tensor_contraction_params.swapped_arguments = static_cast(Layout) == RowMajor; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index a4f92ee44..b9956cd43 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -1215,16 +1215,16 @@ EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, } -template -struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { typedef GpuDevice Device; - typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu index f6c3a9908..79f4066e9 100644 --- a/unsupported/test/cxx11_tensor_argmax_gpu.cu +++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu @@ -242,7 +242,7 @@ void test_gpu_argmin_dim() } } -void test_cxx11_tensor_gpu() +EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu) { CALL_SUBTEST_1(test_gpu_simple_argmax()); CALL_SUBTEST_1(test_gpu_simple_argmax()); diff --git a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu index 0a37a555c..97923d15f 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu @@ -72,7 +72,7 @@ void test_fallback_conversion() { } -void test_cxx11_tensor_cast_float16_gpu() +EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu) { CALL_SUBTEST(test_gpu_conversion()); CALL_SUBTEST(test_fallback_conversion()); diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu index cb1416478..575bdc1f9 100644 --- a/unsupported/test/cxx11_tensor_contract_gpu.cu +++ b/unsupported/test/cxx11_tensor_contract_gpu.cu @@ -193,7 +193,7 @@ void test_gpu_contraction_sizes() { } } -void test_cxx11_tensor_gpu() +EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu) { CALL_SUBTEST_1(test_gpu_contraction(128, 128, 128)); CALL_SUBTEST_1(test_gpu_contraction(128, 128, 128)); diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index cd9ba3ecd..c9f78d2d3 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -389,7 +389,7 @@ void test_gpu() { } -void test_cxx11_tensor_device() +EIGEN_DECLARE_TEST(cxx11_tensor_device) { CALL_SUBTEST_1(test_cpu()); CALL_SUBTEST_2(test_gpu()); diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu index faaac73cf..14fc0bd04 100644 --- a/unsupported/test/cxx11_tensor_gpu.cu +++ b/unsupported/test/cxx11_tensor_gpu.cu @@ -1472,7 +1472,7 @@ void test_gpu_gamma_sample_der_alpha() gpuFree(d_out); } -void test_cxx11_tensor_gpu() +EIGEN_DECLARE_TEST(cxx11_tensor_gpu) { CALL_SUBTEST_1(test_gpu_nullary()); CALL_SUBTEST_1(test_gpu_elementwise_small()); diff --git 
a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu index 1f1ec26c2..4d74e6138 100644 --- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu +++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu @@ -479,7 +479,7 @@ void test_gpu_forced_evals() { #endif -void test_cxx11_tensor_of_float16_gpu() +EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) { CALL_SUBTEST_1(test_gpu_numext()); diff --git a/unsupported/test/cxx11_tensor_random_gpu.cu b/unsupported/test/cxx11_tensor_random_gpu.cu index 262182d30..090986ebc 100644 --- a/unsupported/test/cxx11_tensor_random_gpu.cu +++ b/unsupported/test/cxx11_tensor_random_gpu.cu @@ -78,7 +78,7 @@ static void test_complex() } -void test_cxx11_tensor_random_gpu() +EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu) { CALL_SUBTEST(test_gpu_random_uniform()); CALL_SUBTEST(test_gpu_random_normal()); diff --git a/unsupported/test/cxx11_tensor_reduction_gpu.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu index 7b8ac2309..122ac946b 100644 --- a/unsupported/test/cxx11_tensor_reduction_gpu.cu +++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu @@ -134,7 +134,7 @@ static void test_last_dim_reductions() { } -void test_cxx11_tensor_reduction_gpu() { +EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) { CALL_SUBTEST_1((test_full_reductions())); CALL_SUBTEST_1((test_full_reductions())); CALL_SUBTEST_2((test_full_reductions())); diff --git a/unsupported/test/cxx11_tensor_scan_gpu.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu index f3e773db5..770a144f1 100644 --- a/unsupported/test/cxx11_tensor_scan_gpu.cu +++ b/unsupported/test/cxx11_tensor_scan_gpu.cu @@ -71,7 +71,7 @@ void test_gpu_cumsum(int m_size, int k_size, int n_size) } -void test_cxx11_tensor_scan_gpu() +EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu) { CALL_SUBTEST_1(test_gpu_cumsum(128, 128, 128)); CALL_SUBTEST_2(test_gpu_cumsum(128, 128, 128)); From c95aacab90e9d8bb9f9e082395b3b843a530fa41 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 17 Jul 2018 14:09:37 -0700 Subject: [PATCH 258/680] Fix TensorContractionOp evaluators for GPU and SYCL --- .../Eigen/CXX11/src/Tensor/TensorContractionGpu.h | 10 +++++----- .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 13 ++++++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index b9956cd43..6d3aa24c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -505,9 +505,9 @@ template struct LaunchSyclKernels; -template -struct TensorEvaluator, const Eigen::SyclDevice> : - public TensorContractionEvaluatorBase, const Eigen::SyclDevice> > { +template +struct TensorEvaluator, const Eigen::SyclDevice> : + public TensorContractionEvaluatorBase, const Eigen::SyclDevice> > { + + static_assert(std::is_same::value, + "SYCL tensor contraction does not support output kernels."); typedef const Eigen::SyclDevice Device; - typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; From 3a2dc3869ed77cb6eca2beaad5c0af7d5134f15c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Jul 2018 02:26:43 -0700 Subject: [PATCH 259/680] 
Fix weird issue with MSVC 2013 --- test/mapped_matrix.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp index e1106ba42..0ea136ae6 100644 --- a/test/mapped_matrix.cpp +++ b/test/mapped_matrix.cpp @@ -62,8 +62,9 @@ template void map_class_matrix(const MatrixType& m) for(int i = 0; i < size; i++) array2[i] = Scalar(1); // array3unaligned -> unaligned pointer to heap Scalar* array3 = new Scalar[size+1]; - for(int i = 0; i < size+1; i++) array3[i] = Scalar(1); - Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3; + Index sizep1 = size + 1; // <- without this temporary MSVC 2013 generates bad code + for(Index i = 0; i < sizep1; i++) array3[i] = Scalar(1); + Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? array3+1 : array3; Scalar array4[256]; if(size<=256) for(int i = 0; i < size; i++) array4[i] = Scalar(1); From 000840cae0568da3c9148ddfe18b2af5b3e24067 Mon Sep 17 00:00:00 2001 From: Viktor Csomor Date: Wed, 7 Feb 2018 19:10:54 +0100 Subject: [PATCH 260/680] Added a move constructor and move assignment operator to Tensor and wrote some tests. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 15 ++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_move.cpp | 81 +++++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 unsupported/test/cxx11_tensor_move.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 4fd96448f..e3f6e37f0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -398,6 +398,21 @@ class Tensor : public TensorBase::run(assign, DefaultDevice()); } + #if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(Self&& other) + : Tensor() + { + m_storage.swap(other.m_storage); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) + { + m_storage.swap(other.m_storage); + return *this; + } + #endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 73befbca1..0fc864eca 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -225,6 +225,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_ifft) ei_add_test(cxx11_tensor_scan) ei_add_test(cxx11_tensor_trace) + ei_add_test(cxx11_tensor_move) endif() diff --git a/unsupported/test/cxx11_tensor_move.cpp b/unsupported/test/cxx11_tensor_move.cpp new file mode 100644 index 000000000..0ab2b7786 --- /dev/null +++ b/unsupported/test/cxx11_tensor_move.cpp @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Viktor Csomor +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void calc_indices(int i, int& x, int& y, int& z) +{ + x = i / 4; + y = (i % 4) / 2; + z = i % 2; +} + +static void test_move() +{ + int x; + int y; + int z; + + Tensor tensor1(2, 2, 2); + Tensor tensor2(2, 2, 2); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + tensor1(x,y,z) = i; + tensor2(x,y,z) = 2 * i; + } + + // Invokes the move constructor.
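Both the move constructor and the move assignment added by this patch are implemented as m_storage.swap(other.m_storage), so a moved-from Tensor is not cleared unconditionally: it receives whatever storage the destination previously held. The test below relies on this: tensor1 and tensor2 end up empty because they are swapped with freshly default-constructed Tensors, while moved_tensor1 and moved_tensor2 still report size 8 after being move-assigned away, now holding the zero-filled 2x2x2 storage received in the swap. A minimal usage sketch, assuming only the members this patch adds:

    Eigen::Tensor<int, 3> a(2, 2, 2);
    a.setZero();
    Eigen::Tensor<int, 3> b = std::move(a);  // move-construct: b owns the data, a is left empty
    a = std::move(b);                        // move-assign: the two storages swap back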
+ Tensor moved_tensor1 = std::move(tensor1); + Tensor moved_tensor2 = std::move(tensor2); + + VERIFY_IS_EQUAL(tensor1.size(), 0); + VERIFY_IS_EQUAL(tensor2.size(), 0); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + VERIFY_IS_EQUAL(moved_tensor1(x,y,z), i); + VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 2 * i); + } + + Tensor moved_tensor3(2,2,2); + Tensor moved_tensor4(2,2,2); + + moved_tensor3.setZero(); + moved_tensor4.setZero(); + + // Invokes the move assignment operator. + moved_tensor3 = std::move(moved_tensor1); + moved_tensor4 = std::move(moved_tensor2); + + VERIFY_IS_EQUAL(moved_tensor1.size(), 8); + VERIFY_IS_EQUAL(moved_tensor2.size(), 8); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + VERIFY_IS_EQUAL(moved_tensor1(x,y,z), 0); + VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 0); + VERIFY_IS_EQUAL(moved_tensor3(x,y,z), i); + VERIFY_IS_EQUAL(moved_tensor4(x,y,z), 2 * i); + } +} + +EIGEN_DECLARE_TEST(cxx11_tensor_move) +{ + CALL_SUBTEST(test_move()); +} From f4a6863c75a14e72bead8b844000ee5239e95f70 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Feb 2018 16:43:49 +0100 Subject: [PATCH 261/680] Fix typo --- Eigen/src/Geometry/Scaling.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index c1899a029..8d9acf252 100755 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -77,10 +77,10 @@ public: /** Concatenates a uniform scaling and an affine transformation */ template inline typename - internal::uniformscaling_times_affine_returntype ::type + internal::uniformscaling_times_affine_returntype::type operator* (const Transform& t) const { - typename internal::uniformscaling_times_affine_returntype res = t; + typename internal::uniformscaling_times_affine_returntype::type res = t; res.prescale(factor()); return res; } From 12efc7d41b80259b996be5781bf596c249c90d3f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Feb 2018 16:45:49 +0100 Subject: [PATCH 262/680] Fix linear indexing in generic block evaluation. --- Eigen/src/Core/CoreEvaluators.h | 30 ++++++++++++++++++++++-------- test/block.cpp | 11 +++++++++-- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index d1454117a..8419798c1 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1079,7 +1079,8 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) : m_argImpl(block.nestedExpression()), m_startRow(block.startRow()), - m_startCol(block.startCol()) + m_startCol(block.startCol()), + m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0) { } typedef typename XprType::Scalar Scalar; @@ -1098,7 +1099,10 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + if (InnerPanel) + return m_argImpl.coeff(m_linear_offset.value() + index); + else + return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1110,7 +1114,10 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? 
index : 0); + if (InnerPanel) + return m_argImpl.coeffRef(m_linear_offset.value() + index); + else + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } template @@ -1124,8 +1131,11 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return packet(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0); + if (InnerPanel) + return m_argImpl.template packet(m_linear_offset.value() + index); + else + return packet(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); } template @@ -1139,15 +1149,19 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return writePacket(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0, - x); + if (InnerPanel) + return m_argImpl.template writePacket(m_linear_offset.value() + index, x); + else + return writePacket(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0, + x); } protected: evaluator m_argImpl; const variable_if_dynamic m_startRow; const variable_if_dynamic m_startCol; + const variable_if_dynamic m_linear_offset; }; // TODO: This evaluator does not actually use the child evaluator; diff --git a/test/block.cpp b/test/block.cpp index d61059874..8c4dd87be 100644 --- a/test/block.cpp +++ b/test/block.cpp @@ -44,7 +44,7 @@ template void block(const MatrixType& m) typedef typename MatrixType::RealScalar RealScalar; typedef Matrix VectorType; typedef Matrix RowVectorType; - typedef Matrix DynamicMatrixType; + typedef Matrix DynamicMatrixType; typedef Matrix DynamicVectorType; Index rows = m.rows(); @@ -142,7 +142,7 @@ template void block(const MatrixType& m) VERIFY(numext::real(ones.col(c1).dot(ones.col(c2))) == RealScalar(rows)); VERIFY(numext::real(ones.row(r1).dot(ones.row(r2))) == RealScalar(cols)); - // chekc that linear acccessors works on blocks + // check that linear accessors work on blocks m1 = m1_copy; if((MatrixType::Flags&RowMajorBit)==0) VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1+c1*rows), m1(r1,c1)); @@ -166,6 +166,13 @@ template void block(const MatrixType& m) VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() ); VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() ); + VERIFY_IS_APPROX( (m1*1).topRows(r1), m1.topRows(r1) ); + VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) ); + VERIFY_IS_APPROX( (m1*1).transpose().topRows(c1), m1.transpose().topRows(c1) ); + VERIFY_IS_APPROX( (m1*1).transpose().leftCols(r1), m1.transpose().leftCols(r1) ); + VERIFY_IS_APPROX( (m1*1).transpose().middleRows(c1,c2-c1+1), m1.transpose().middleRows(c1,c2-c1+1) ); + VERIFY_IS_APPROX( (m1*1).transpose().middleCols(r1,r2-r1+1), m1.transpose().middleCols(r1,r2-r1+1) ); + // evaluation into plain matrices from expressions with direct access (stress MapBase) DynamicMatrixType dm; DynamicVectorType dv; From 5deeb19e7bb19c67abeac0a6cfa26ad3d14e215b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 9 Feb 2018 16:52:35 +0100 Subject: [PATCH 263/680] bug #1517: fix triangular product with unit diagonal and nested scaling factor: (s*A).triangularView()*B --- .../Core/products/TriangularMatrixMatrix.h | 24 ++++++++++++-- .../Core/products/TriangularMatrixVector.h | 22 ++++++++++--- test/product_trmm.cpp | 28 +++++++++++++------ 3 files changed, 59 insertions(+), 15 deletions(-) diff --git
a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 539b6c0c6..f784507e7 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -400,7 +400,9 @@ struct triangular_product_impl { template static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) { - typedef typename Dest::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar Scalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; @@ -412,8 +414,9 @@ struct triangular_product_impl typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha; typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; @@ -438,6 +441,21 @@ struct triangular_product_impl &dst.coeffRef(0,0), dst.outerStride(), // result info actualAlpha, blocking ); + + // Apply correction if the diagonal is unit and a scalar factor was nested: + if ((Mode&UnitDiag)==UnitDiag) + { + if (LhsIsTriangular && lhs_alpha!=LhsScalar(1)) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize); + } + else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1)) + { + Index diagSize = (std::min)(rhs.rows(),rhs.cols()); + dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize); + } + } } }; diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 4b292e74d..76bfa159c 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -221,8 +221,9 @@ template struct trmv_selector typename internal::add_const_on_value_type::type actualLhs = LhsBlasTraits::extract(lhs); typename internal::add_const_on_value_type::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 @@ -274,6 +275,12 @@ template struct trmv_selector else dest = MappedDest(actualDestPtr, dest.size()); } + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; @@ -295,8 +302,9 @@ template struct trmv_selector typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * 
RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 @@ -326,6 +334,12 @@ template struct trmv_selector actualRhsPtr,1, dest.data(),dest.innerStride(), actualAlpha); + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; diff --git a/test/product_trmm.cpp b/test/product_trmm.cpp index 12e554410..e08d9f39f 100644 --- a/test/product_trmm.cpp +++ b/test/product_trmm.cpp @@ -29,7 +29,7 @@ void trmm(int rows=get_random_size(), typedef Matrix ResXS; typedef Matrix ResSX; - TriMatrix mat(rows,cols), tri(rows,cols), triTr(cols,rows); + TriMatrix mat(rows,cols), tri(rows,cols), triTr(cols,rows), s1tri(rows,cols), s1triTr(cols,rows); OnTheRight ge_right(cols,otherCols); OnTheLeft ge_left(otherCols,rows); @@ -42,6 +42,8 @@ void trmm(int rows=get_random_size(), mat.setRandom(); tri = mat.template triangularView(); triTr = mat.transpose().template triangularView(); + s1tri = (s1*mat).template triangularView(); + s1triTr = (s1*mat).transpose().template triangularView(); ge_right.setRandom(); ge_left.setRandom(); @@ -51,19 +53,29 @@ void trmm(int rows=get_random_size(), VERIFY_IS_APPROX( ge_xs.noalias() = mat.template triangularView() * ge_right, tri * ge_right); VERIFY_IS_APPROX( ge_sx.noalias() = ge_left * mat.template triangularView(), ge_left * tri); - VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.adjoint()).template triangularView() * (s2*ge_left.transpose()), s1*triTr.conjugate() * (s2*ge_left.transpose())); - VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.transpose() * mat.adjoint().template triangularView(), ge_right.transpose() * triTr.conjugate()); + if((Mode&UnitDiag)==0) + VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.adjoint()).template triangularView() * (s2*ge_left.transpose()), s1*triTr.conjugate() * (s2*ge_left.transpose())); - VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.adjoint()).template triangularView() * (s2*ge_left.adjoint()), s1*triTr.conjugate() * (s2*ge_left.adjoint())); - VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.adjoint() * mat.adjoint().template triangularView(), ge_right.adjoint() * triTr.conjugate()); + VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.transpose()).template triangularView() * (s2*ge_left.transpose()), s1triTr * (s2*ge_left.transpose())); + VERIFY_IS_APPROX( ge_sx.noalias() = (s2*ge_left) * (s1*mat).template triangularView(), (s2*ge_left)*s1tri); + VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.transpose() * mat.adjoint().template triangularView(), ge_right.transpose() * triTr.conjugate()); + VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.adjoint() * mat.adjoint().template triangularView(), ge_right.adjoint() * triTr.conjugate()); + ge_xs_save = ge_xs; - VERIFY_IS_APPROX( (ge_xs_save + s1*triTr.conjugate() * (s2*ge_left.adjoint())).eval(), ge_xs.noalias() += (s1*mat.adjoint()).template triangularView() * (s2*ge_left.adjoint()) ); + if((Mode&UnitDiag)==0) + VERIFY_IS_APPROX( (ge_xs_save + s1*triTr.conjugate() * (s2*ge_left.adjoint())).eval(), ge_xs.noalias() += (s1*mat.adjoint()).template triangularView() * (s2*ge_left.adjoint()) ); + ge_xs_save = ge_xs; + VERIFY_IS_APPROX( (ge_xs_save + s1triTr * (s2*ge_left.adjoint())).eval(), ge_xs.noalias() += (s1*mat.transpose()).template 
triangularView() * (s2*ge_left.adjoint()) ); ge_sx.setRandom(); ge_sx_save = ge_sx; - VERIFY_IS_APPROX( ge_sx_save - (ge_right.adjoint() * (-s1 * triTr).conjugate()).eval(), ge_sx.noalias() -= (ge_right.adjoint() * (-s1 * mat).adjoint().template triangularView()).eval()); + if((Mode&UnitDiag)==0) + VERIFY_IS_APPROX( ge_sx_save - (ge_right.adjoint() * (-s1 * triTr).conjugate()).eval(), ge_sx.noalias() -= (ge_right.adjoint() * (-s1 * mat).adjoint().template triangularView()).eval()); - VERIFY_IS_APPROX( ge_xs = (s1*mat).adjoint().template triangularView() * ge_left.adjoint(), numext::conj(s1) * triTr.conjugate() * ge_left.adjoint()); + if((Mode&UnitDiag)==0) + VERIFY_IS_APPROX( ge_xs = (s1*mat).adjoint().template triangularView() * ge_left.adjoint(), numext::conj(s1) * triTr.conjugate() * ge_left.adjoint()); + VERIFY_IS_APPROX( ge_xs = (s1*mat).transpose().template triangularView() * ge_left.adjoint(), s1triTr * ge_left.adjoint()); + // TODO check with sub-matrix expressions ? } From e3912f5e63b0c08a0f592ad425e926a5d61e1b8a Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sun, 11 Mar 2018 10:01:44 -0400 Subject: [PATCH 264/680] Misc. source and comment typos Found using `codespell` and `grep` from downstream FreeCAD --- Eigen/Core | 2 +- Eigen/src/Cholesky/LDLT.h | 2 +- Eigen/src/Cholesky/LLT.h | 2 +- Eigen/src/Core/AssignEvaluator.h | 4 ++-- Eigen/src/Core/DenseBase.h | 2 +- Eigen/src/Core/DenseStorage.h | 2 +- Eigen/src/Core/MathFunctions.h | 2 +- Eigen/src/Core/NoAlias.h | 4 ++-- Eigen/src/Core/PlainObjectBase.h | 2 +- Eigen/src/Core/Product.h | 2 +- Eigen/src/Core/Transpositions.h | 2 +- Eigen/src/Core/TriangularMatrix.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 ++-- Eigen/src/Core/arch/SSE/MathFunctions.h | 2 +- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++-- Eigen/src/Core/products/GeneralMatrixVector.h | 2 +- Eigen/src/Core/products/Parallelizer.h | 2 +- Eigen/src/Core/products/SelfadjointMatrixVector.h | 2 +- Eigen/src/Core/util/Macros.h | 4 ++-- Eigen/src/Core/util/Memory.h | 2 +- Eigen/src/Core/util/Meta.h | 2 +- Eigen/src/Eigenvalues/ComplexEigenSolver.h | 2 +- Eigen/src/Eigenvalues/ComplexSchur.h | 2 +- Eigen/src/Eigenvalues/EigenSolver.h | 2 +- .../Eigenvalues/GeneralizedSelfAdjointEigenSolver.h | 2 +- Eigen/src/Eigenvalues/RealQZ.h | 2 +- Eigen/src/Eigenvalues/RealSchur.h | 2 +- Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 2 +- Eigen/src/Geometry/Scaling.h | 2 +- Eigen/src/IterativeLinearSolvers/IncompleteLUT.h | 4 ++-- Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h | 2 +- Eigen/src/KLUSupport/KLUSupport.h | 2 +- Eigen/src/LU/FullPivLU.h | 2 +- Eigen/src/LU/PartialPivLU.h | 4 ++-- Eigen/src/OrderingMethods/Eigen_Colamd.h | 6 +++--- Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 +- Eigen/src/PardisoSupport/PardisoSupport.h | 2 +- Eigen/src/QR/ColPivHouseholderQR.h | 2 +- Eigen/src/QR/CompleteOrthogonalDecomposition.h | 2 +- Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 2 +- Eigen/src/SVD/BDCSVD.h | 2 +- Eigen/src/SVD/UpperBidiagonalization.h | 4 ++-- Eigen/src/SparseCholesky/SimplicialCholesky.h | 2 +- Eigen/src/SparseCore/SparseMatrix.h | 6 +++--- Eigen/src/SparseCore/SparseProduct.h | 2 +- Eigen/src/SparseCore/SparseVector.h | 2 +- Eigen/src/SparseLU/SparseLU.h | 2 +- Eigen/src/SparseLU/SparseLU_Memory.h | 2 +- Eigen/src/SparseLU/SparseLU_column_dfs.h | 4 ++-- Eigen/src/SparseLU/SparseLU_gemm_kernel.h | 2 +- Eigen/src/SparseLU/SparseLU_panel_bmod.h | 2 +- Eigen/src/SuperLUSupport/SuperLUSupport.h | 2 +- Eigen/src/UmfPackSupport/UmfPackSupport.h |
2 +- Eigen/src/plugins/IndexedViewMethods.h | 2 +- bench/analyze-blocking-sizes.cpp | 2 +- bench/btl/README | 2 +- bench/btl/generic_bench/bench.hh | 2 +- bench/btl/generic_bench/utils/size_log.hh | 2 +- bench/btl/generic_bench/utils/xy_file.hh | 2 +- bench/btl/libs/ublas/ublas_interface.hh | 2 +- bench/eig33.cpp | 2 +- bench/spbench/spbenchsolver.cpp | 2 +- blas/f2c/ctbmv.c | 2 +- blas/f2c/dtbmv.c | 2 +- blas/f2c/stbmv.c | 2 +- blas/f2c/ztbmv.c | 2 +- blas/level1_impl.h | 2 +- blas/testing/cblat1.f | 2 +- blas/testing/dblat1.f | 2 +- blas/testing/sblat1.f | 2 +- blas/testing/zblat1.f | 2 +- cmake/EigenConfigureTesting.cmake | 4 ++-- cmake/EigenTesting.cmake | 10 +++++----- cmake/FindComputeCpp.cmake | 2 +- cmake/FindEigen3.cmake | 2 +- debug/msvc/eigen_autoexp_part.dat | 2 +- doc/Doxyfile.in | 2 +- doc/FunctionsTakingEigenTypes.dox | 4 ++-- doc/PreprocessorDirectives.dox | 2 +- doc/QuickStartGuide.dox | 2 +- doc/SparseQuickReference.dox | 2 +- doc/TemplateKeyword.dox | 2 +- doc/TopicLazyEvaluation.dox | 2 +- doc/TopicLinearAlgebraDecompositions.dox | 2 +- doc/TopicMultithreading.dox | 2 +- doc/TutorialMapClass.dox | 4 ++-- doc/TutorialSparse.dox | 4 ++-- doc/UnalignedArrayAssert.dox | 4 ++-- doc/UsingNVCC.dox | 2 +- doc/eigendoxy.css | 2 +- doc/special_examples/Tutorial_sparse_example.cpp | 2 +- lapack/CMakeLists.txt | 4 ++-- test/CMakeLists.txt | 2 +- test/bdcsvd.cpp | 2 +- test/eigensolver_complex.cpp | 2 +- test/geo_quaternion.cpp | 2 +- test/main.h | 2 +- test/packetmath.cpp | 2 +- 98 files changed, 122 insertions(+), 122 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 5a6dec8cc..a9bbfe276 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -360,7 +360,7 @@ inline static const char *SimdInstructionSetsInUse(void) { namespace Eigen { -// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to +// we use size_t frequently and we'll never remember to prepend it with std:: every time just to // ensure QNX/QCC support using std::size_t; // gcc 4.6.0 wants std:: for ptrdiff_t diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 13a8f6d14..5be58377b 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -247,7 +247,7 @@ template class LDLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 814174d47..22e4be75d 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -180,7 +180,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears not to be positive definite. */ ComputationInfo info() const diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index ebf5590de..362d905d2 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType // AssignmentKind must define a Kind typedef. 
template struct AssignmentKind; -// Assignement kind defined in this file: +// Assignment kind defined in this file: struct Dense2Dense {}; struct EigenBase2EigenBase {}; @@ -899,7 +899,7 @@ struct Assignment src.evalTo(dst); } - // NOTE The following two functions are templated to avoid their instanciation if not needed + // NOTE The following two functions are templated to avoid their instantiation if not needed // This is needed because some expressions supports evalTo only and/or have 'void' as scalar type. template EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index fd933eed4..53b427b17 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -395,7 +395,7 @@ template class DenseBase * Notice that in the case of a plain matrix or vector (not an expression) this function just returns * a const reference, in order to avoid a useless copy. * - * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. + * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvalReturnType eval() const diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 7958feeb9..9e58fbf88 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -61,7 +61,7 @@ struct plain_array #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) #elif EIGEN_GNUC_AT_LEAST(4,7) - // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned. + // GCC 4.7 is too aggressive in its optimizations and removes the alignment test based on the fact that the array is declared to be aligned. // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: template diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 1b864a405..e981129b2 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -749,7 +749,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); } -// Implementatin of is* functions +// Implementation of is* functions // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. #if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index e94c8ee96..570283d90 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -75,10 +75,10 @@ class NoAlias * * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag. * Currently, even though several expressions may alias, only product - * expressions have this flag. Therefore, noalias() is only usefull when + * expressions have this flag. Therefore, noalias() is only useful when * the source expression contains a matrix product.
* - * Here are some examples where noalias is usefull: + * Here are some examples where noalias is useful: * \code * D.noalias() = A * B; * D.noalias() += A.transpose() * B; diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 1dc7e223a..6c0a42ec7 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -780,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type resize(size); } - // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted) + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 676c48027..3d67d9489 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -116,7 +116,7 @@ class dense_product_base : public internal::dense_xpr_base >::type {}; -/** Convertion to scalar for inner-products */ +/** Conversion to scalar for inner-products */ template class dense_product_base : public internal::dense_xpr_base >::type diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 19c17bb4a..8798deca5 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -84,7 +84,7 @@ class TranspositionsBase } // FIXME: do we want such methods ? - // might be usefull when the target matrix expression is complex, e.g.: + // might be useful when the target matrix expression is complex, e.g.: // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..); /* template diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index ed80da36a..ab73fcf21 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -470,7 +470,7 @@ template class TriangularViewImpl<_Mat * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if * \a Side==OnTheRight. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this @@ -496,7 +496,7 @@ template class TriangularViewImpl<_Mat * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. * This function will const_cast it, so constness isn't honored here. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * See TriangularView:solve() for the details. 
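* * A brief illustrative sketch (assuming \c A is a square matrix whose lower triangular part is invertible and \c b is a matching vector): * \code * A.triangularView<Lower>().solveInPlace(b); // b is overwritten with the solution x of A.triangularView<Lower>() * x = b * \endcode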
*/ diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index b3f1ea199..31bb896ca 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -434,7 +434,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data } #else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +// We also need to redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD @@ -500,7 +500,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& f vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } #else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +// We also need to redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 7b5f948e1..4af2c6cae 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -242,7 +242,7 @@ Packet2d pexp(const Packet2d& _x) return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); } -/* evaluation of 4 sines at onces, using SSE2 intrinsics. +/* evaluation of 4 sines at once, using SSE2 intrinsics. The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 45230bce5..9072d0ff3 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1523,7 +1523,7 @@ void gebp_kernel::half SResPacketHalf; @@ -1924,7 +1924,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 41d8242e1..b2a71bc6f 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -201,7 +201,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(nbThreads(), pb_max_threads); - // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session, + // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session, // then abort multi-threading // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 3fd180e6c..67390f1d7 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { /* Optimized selfadjoint matrix * vector product: - * This algorithm processes 2 columns at onces that allows to both reduce + * This algorithm processes 2 columns at once that allows to both reduce * the number of load/stores of the result by a factor 2 and to reduce * the instruction dependency. 
*/ diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e351b7ad9..5872ade26 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -719,7 +719,7 @@ namespace Eigen { #error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. #endif -// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated +// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated // They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 #if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES @@ -778,7 +778,7 @@ namespace Eigen { #endif // At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not. -// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES) +// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES) // and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). // Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index c455f92a1..006b0bfba 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -703,7 +703,7 @@ template void swap(scoped_array &a,scoped_array &b) * - 32 bytes alignment if AVX is enabled. * - 64 bytes alignment if AVX512 is enabled. * -* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented * \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 0fa818008..998b8921a 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -272,7 +272,7 @@ template<> struct numeric_limits #endif /** \internal - * A base class do disable default copy ctor and copy assignement operator. + * A base class to disable default copy ctor and copy assignment operator. */ class noncopyable { diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h index dc5fae06a..081e918f1 100644 --- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -214,7 +214,7 @@ template class ComplexEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index 7f38919f7..b8b3490c6 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h @@ -212,7 +212,7 @@ template class ComplexSchur /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise.
*/ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index f205b185d..997bebe7b 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -277,7 +277,7 @@ template class EigenSolver template EigenSolver& compute(const EigenBase& matrix, bool computeEigenvectors = true); - /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); diff --git a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h index 5f6bb8289..d0f9091be 100644 --- a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h @@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT * * \returns Reference to \c *this * - * Accoring to \p options, this function computes eigenvalues and (if requested) + * According to \p options, this function computes eigenvalues and (if requested) * the eigenvectors of one of the following three generalized eigenproblems: * - \c Ax_lBx: \f$ Ax = \lambda B x \f$ * - \c ABx_lx: \f$ ABx = \lambda x \f$ diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index b3a910dd9..e2b37f40e 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -161,7 +161,7 @@ namespace Eigen { /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index f5c86041d..9e71f3040 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -190,7 +190,7 @@ template class RealSchur RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU); /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 9ddd553f2..040f8d3bb 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -337,7 +337,7 @@ template class SelfAdjointEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. 
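* * Typical usage is to check this flag right after the computation, e.g. (illustrative sketch, with \c A a given selfadjoint matrix): * \code * SelfAdjointEigenSolver<MatrixXd> es(A); * if(es.info() != Success) abort(); // or handle the failure in some other way * \endcode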
*/ EIGEN_DEVICE_FUNC ComputationInfo info() const diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index 8d9acf252..df650fda6 100755 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -128,7 +128,7 @@ public: /** Concatenates a linear transformation matrix and a uniform scaling * \relates UniformScaling */ -// NOTE this operator is defiend in MatrixBase and not as a friend function +// NOTE this operator is defined in MatrixBase and not as a friend function // of UniformScaling to fix an internal crash of Intel's ICC template EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 338e6f10a..43bd8e8f6 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -136,7 +136,7 @@ class IncompleteLUT : public SparseSolverBase::analyzePattern(const _MatrixType& amat) SparseMatrix mat1 = amat; SparseMatrix mat2 = amat.transpose(); // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice. - // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered... + // on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred... SparseMatrix AtA = mat2 + mat1; AMDOrdering ordering; ordering(AtA,m_P); diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 7c2326eb7..bfeee71cd 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -275,7 +275,7 @@ public: const Preconditioner& preconditioner() const { return m_preconditioner; } /** \returns the max number of iterations. - * It is either the value setted by setMaxIterations or, by default, + * It is either the value set by setMaxIterations or, by default, * twice the number of columns of the matrix. */ Index maxIterations() const diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h index a9e8633d9..d2633a935 100644 --- a/Eigen/src/KLUSupport/KLUSupport.h +++ b/Eigen/src/KLUSupport/KLUSupport.h @@ -106,7 +106,7 @@ class KLU : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index ec61086d5..50d1bb41b 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -48,7 +48,7 @@ template struct traits > * The data of the LU decomposition can be directly accessed through the methods matrixLU(), * permutationP(), permutationQ(). * - * As an exemple, here is how the original matrix can be retrieved: + * As an example, here is how the original matrix can be retrieved: * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index d43961887..bfcd2c95b 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -420,8 +420,8 @@ struct partial_lu_impl * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * * \note This very low level interface using pointers, etc. 
is to: - * 1 - reduce the number of instanciations to the strict minimum - * 2 - avoid infinite recursion of the instanciations with Block > > + * 1 - reduce the number of instantiations to the strict minimum + * 2 - avoid infinite recursion of the instantiations with Block > > */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index da85b4d6e..67fcad3f7 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -1493,7 +1493,7 @@ static inline void order_children c = Col [c].shared1.parent ; /* continue until we hit an ordered column. There are */ - /* guarranteed not to be anymore unordered columns */ + /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ } while (Col [c].shared2.order == COLAMD_EMPTY) ; @@ -1638,7 +1638,7 @@ static void detect_super_cols COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ; COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ; /* row indices will same order for both supercols, */ - /* no gather scatter nessasary */ + /* no gather scatter necessary */ if (*cp1++ != *cp2++) { break ; @@ -1688,7 +1688,7 @@ static void detect_super_cols /* Defragments and compacts columns and rows in the workspace A. Used when - all avaliable memory has been used while performing row merging. Returns + all available memory has been used while performing row merging. Returns the index of the first free position in A, after garbage collection. The time taken by this routine is linear is the size of the array A, which is itself linear in the number of nonzeros in the input matrix. diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 160d8a523..37426877a 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the PaStiX reports a problem * \c InvalidInput if the input matrix is invalid * diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 091c3970e..fb2ba04b4 100644 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -140,7 +140,7 @@ class PardisoImpl : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix appears to be negative. */ ComputationInfo info() const diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index 5270eaca2..ed47b05e3 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -402,7 +402,7 @@ template class ColPivHouseholderQR */ RealScalar maxPivot() const { return m_maxpivot; } - /** \brief Reports whether the QR factorization was succesful. + /** \brief Reports whether the QR factorization was successful. * * \note This function always returns \c Success. It is provided for compatibility * with other factorization routines. 
diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 13b61fcdb..880becb25 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -353,7 +353,7 @@ class CompleteOrthogonalDecomposition { inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); } /** \brief Reports whether the complete orthogonal decomposition was - * succesful. + * successful. * * \note This function always returns \c Success. It is provided for * compatibility diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 953d57c9d..1a5c5254e 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -220,7 +220,7 @@ class SPQR : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the sparse QR can not be computed */ ComputationInfo info() const diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 06865a331..a24deb96a 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -62,7 +62,7 @@ struct traits > * recommended and can several order of magnitude faster. * * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations. - * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless + * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will * significantly degrade the accuracy. * diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 11ac847e1..0526ac931 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -202,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType y_k( Y.col(k).tail(remainingCols) ); - // let's use the begining of column k of Y as a temporary vector + // let's use the beginning of column k of Y as a temporary vector SubColumnType tmp( Y.col(k).head(k) ); y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck tmp.noalias() = V_k1.adjoint() * v_k; @@ -231,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType x_k ( X.col(k).tail(remainingRows-1) ); - // let's use the begining of column k of X as a temporary vectors + // let's use the beginning of column k of X as temporary vectors // note that tmp0 and tmp1 overlaps SubColumnType tmp0 ( X.col(k).head(k) ), tmp1 ( X.col(k).head(k+1) ); diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 2907f6529..b9ca94bc3 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -101,7 +101,7 @@ class SimplicialCholeskyBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative.
*/ ComputationInfo info() const diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 323c2323b..8f77194b6 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -21,7 +21,7 @@ namespace Eigen { * This class implements a more versatile variants of the common \em compressed row/column storage format. * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index. * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra - * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero + * space in between the nonzeros of two successive columns (resp. rows) such that insertion of new non-zero * can be done with limited memory reallocation and copies. * * A call to the function makeCompressed() turns the matrix into the standard \em compressed format @@ -503,7 +503,7 @@ class SparseMatrix } } - /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */ + /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */ void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { prune(default_prunning_func(reference,epsilon)); @@ -986,7 +986,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa * * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather - * be explicitely stored into a std::vector for instance. + * be explicitly stored into a std::vector for instance. */ template template diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h index 4cbf68781..c495a7398 100644 --- a/Eigen/src/SparseCore/SparseProduct.h +++ b/Eigen/src/SparseCore/SparseProduct.h @@ -17,7 +17,7 @@ namespace Eigen { * The automatic pruning of the small values can be achieved by calling the pruned() function * in which case a totally different product algorithm is employed: * \code - * C = (A*B).pruned(); // supress numerical zeros (exact) + * C = (A*B).pruned(); // suppress numerical zeros (exact) * C = (A*B).pruned(ref); * C = (A*B).pruned(ref,epsilon); * \endcode diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 19b0fbc9d..05779be68 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -281,7 +281,7 @@ class SparseVector } /** Swaps the values of \c *this and \a other. - * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only. + * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only. * \sa SparseMatrixBase::swap() */ inline void swap(SparseVector& other) diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index f883ab383..383a203b4 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -193,7 +193,7 @@ class SparseLU : public SparseSolverBase >, /** \brief Reports whether previous computation was successful.
* - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance * \c InvalidInput if the input matrix is invalid * diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h index 4dc42e87b..349bfd585 100644 --- a/Eigen/src/SparseLU/SparseLU_Memory.h +++ b/Eigen/src/SparseLU/SparseLU_Memory.h @@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w) /** - * Expand the existing storage to accomodate more fill-ins + * Expand the existing storage to accommodate more fill-ins * \param vec Valid pointer to the vector to allocate or expand * \param[in,out] length At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector * \param[in] nbElts Current number of elements in the factors diff --git a/Eigen/src/SparseLU/SparseLU_column_dfs.h b/Eigen/src/SparseLU/SparseLU_column_dfs.h index c98b30e32..5a2c941b4 100644 --- a/Eigen/src/SparseLU/SparseLU_column_dfs.h +++ b/Eigen/src/SparseLU/SparseLU_column_dfs.h @@ -151,7 +151,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j StorageIndex ito = glu.xlsub(fsupc+1); glu.xlsub(jcolm1) = ito; StorageIndex istop = ito + jptr - jm1ptr; - xprune(jcolm1) = istop; // intialize xprune(jcol-1) + xprune(jcolm1) = istop; // initialize xprune(jcol-1) glu.xlsub(jcol) = istop; for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) @@ -166,7 +166,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j // Tidy up the pointers before exit glu.xsup(nsuper+1) = jcolp1; glu.supno(jcolp1) = nsuper; - xprune(jcol) = StorageIndex(nextl); // Intialize upper bound for pruning + xprune(jcol) = StorageIndex(nextl); // Initialize upper bound for pruning glu.xlsub(jcolp1) = StorageIndex(nextl); return 0; diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h index 95ba7413f..e37c2fe0d 100644 --- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ pstore(C0+i+(I)*PacketSize, c0); - // agressive vectorization and peeling + // aggressive vectorization and peeling for(Index i=0; i /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index 9568cc1d5..c636f17ac 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -201,7 +201,7 @@ class UmfPackLU : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. 
*/ ComputationInfo info() const diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index a7ec63adf..9ad2d9aee 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -112,7 +112,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND #if EIGEN_HAS_STATIC_ARRAY_TEMPLATE -// The folowing three overloads are needed to handle raw Index[N] arrays. +// The following three overloads are needed to handle raw Index[N] arrays. template IndexedView::type> diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index d563a1d2d..6bc4aca3d 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -825,7 +825,7 @@ int main(int argc, char* argv[]) } for (int i = 1; i < argc; i++) { bool arg_handled = false; - // Step 1. Try to match action invokation names. + // Step 1. Try to match action invocation names. for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { if (!strcmp(argv[i], (*it)->invokation_name())) { if (!action) { diff --git a/bench/btl/README b/bench/btl/README index f3f5fb36f..ebed88960 100644 --- a/bench/btl/README +++ b/bench/btl/README @@ -36,7 +36,7 @@ For instance: You can also select a given set of actions defining the environment variable BTL_CONFIG this way: BTL_CONFIG="-a action1{:action2}*" ctest -V -An exemple: +An example: BTL_CONFIG="-a axpy:vector_matrix:trisolve:ata" ctest -V -R eigen2 Finally, if bench results already exist (the bench*.dat files) then they merges by keeping the best for each matrix size. If you want to overwrite the previous ones you can simply add the "--overwrite" option: diff --git a/bench/btl/generic_bench/bench.hh b/bench/btl/generic_bench/bench.hh index 7b7b951b5..0732940d5 100644 --- a/bench/btl/generic_bench/bench.hh +++ b/bench/btl/generic_bench/bench.hh @@ -159,7 +159,7 @@ BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point ){ // bench(size_min,size_max,nb_point); - // Only for small problem size. Otherwize it will be too long + // Only for small problem size. Otherwise it will be too long // bench(size_min,size_max,nb_point); // bench(size_min,size_max,nb_point); diff --git a/bench/btl/generic_bench/utils/size_log.hh b/bench/btl/generic_bench/utils/size_log.hh index 13a3da7a8..68945e7cc 100644 --- a/bench/btl/generic_bench/utils/size_log.hh +++ b/bench/btl/generic_bench/utils/size_log.hh @@ -23,7 +23,7 @@ #include "math.h" // The Vector class must satisfy the following part of STL vector concept : // resize() method -// [] operator for seting element +// [] operator for setting element // the vector element are int compatible. 
template void size_log(const int nb_point, const int size_min, const int size_max, Vector & X) diff --git a/bench/btl/generic_bench/utils/xy_file.hh b/bench/btl/generic_bench/utils/xy_file.hh index 4571bed8f..0492faf09 100644 --- a/bench/btl/generic_bench/utils/xy_file.hh +++ b/bench/btl/generic_bench/utils/xy_file.hh @@ -55,7 +55,7 @@ bool read_xy_file(const std::string & filename, std::vector & tab_sizes, // The Vector class must satisfy the following part of STL vector concept : // resize() method -// [] operator for seting element +// [] operator for setting element // the vector element must have the << operator define using namespace std; diff --git a/bench/btl/libs/ublas/ublas_interface.hh b/bench/btl/libs/ublas/ublas_interface.hh index 95cad5195..f59b7cf2f 100644 --- a/bench/btl/libs/ublas/ublas_interface.hh +++ b/bench/btl/libs/ublas/ublas_interface.hh @@ -100,7 +100,7 @@ public : Y+=coef*X; } - // alias free assignements + // alias free assignments static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ X.assign(prod(A,B)); diff --git a/bench/eig33.cpp b/bench/eig33.cpp index 47947a9be..f003d8a53 100644 --- a/bench/eig33.cpp +++ b/bench/eig33.cpp @@ -101,7 +101,7 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals) computeRoots(scaledMat,evals); // compute the eigen vectors - // **here we assume 3 differents eigenvalues** + // **here we assume 3 different eigenvalues** // "optimized version" which appears to be slower with gcc! // Vector base; diff --git a/bench/spbench/spbenchsolver.cpp b/bench/spbench/spbenchsolver.cpp index 4acd0039c..2a7351124 100644 --- a/bench/spbench/spbenchsolver.cpp +++ b/bench/spbench/spbenchsolver.cpp @@ -54,7 +54,7 @@ int main(int argc, char ** args) statbuf.close(); } else - std::cerr << "Unable to open the provided file for writting... \n"; + std::cerr << "Unable to open the provided file for writing... \n"; } // Get the maximum number of iterations and the tolerance diff --git a/blas/f2c/ctbmv.c b/blas/f2c/ctbmv.c index 790fd581f..a6e0dae80 100644 --- a/blas/f2c/ctbmv.c +++ b/blas/f2c/ctbmv.c @@ -147,7 +147,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/dtbmv.c b/blas/f2c/dtbmv.c index fdf73ebb5..aa67d19da 100644 --- a/blas/f2c/dtbmv.c +++ b/blas/f2c/dtbmv.c @@ -143,7 +143,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/stbmv.c b/blas/f2c/stbmv.c index fcf9ce336..b5a68b545 100644 --- a/blas/f2c/stbmv.c +++ b/blas/f2c/stbmv.c @@ -143,7 +143,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/ztbmv.c b/blas/f2c/ztbmv.c index 4cdcd7f88..3bf0beb01 100644 --- a/blas/f2c/ztbmv.c +++ b/blas/f2c/ztbmv.c @@ -147,7 +147,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). 
*/ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/level1_impl.h b/blas/level1_impl.h index f857bfa20..6e7f8c976 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -33,7 +33,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - // be carefull, *incx==0 is allowed !! + // be careful, *incx==0 is allowed !! if(*incx==1 && *incy==1) make_vector(y,*n) = make_vector(x,*n); else diff --git a/blas/testing/cblat1.f b/blas/testing/cblat1.f index 8ca67fb19..73015f5a9 100644 --- a/blas/testing/cblat1.f +++ b/blas/testing/cblat1.f @@ -619,7 +619,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/dblat1.f b/blas/testing/dblat1.f index 30691f9bf..03d9f1345 100644 --- a/blas/testing/dblat1.f +++ b/blas/testing/dblat1.f @@ -990,7 +990,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/sblat1.f b/blas/testing/sblat1.f index 6657c2693..4d43d9b48 100644 --- a/blas/testing/sblat1.f +++ b/blas/testing/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/zblat1.f b/blas/testing/zblat1.f index d30112c63..c00b67dc8 100644 --- a/blas/testing/zblat1.f +++ b/blas/testing/zblat1.f @@ -619,7 +619,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index afc24b5e9..a2a4f54b9 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -20,7 +20,7 @@ include(CTest) set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests") # Overwrite default DartConfiguration.tcl such that ctest can build our unit tests. -# Recall that our unit tests are not in the "all" target, so we have to explicitely ask ctest to build our custom 'buildtests' target. +# Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target. 
# At this stage, we can also add custom flags to the build tool through the user defined EIGEN_TEST_BUILD_FLAGS variable. file(READ "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" EIGEN_DART_CONFIG_FILE) # try to grab the default flags @@ -39,7 +39,7 @@ ei_init_testing() # configure Eigen related testing options option(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions using exceptions" OFF) -option(EIGEN_DEBUG_ASSERTS "Enable advanced debuging of assertions" OFF) +option(EIGEN_DEBUG_ASSERTS "Enable advanced debugging of assertions" OFF) if(CMAKE_COMPILER_IS_GNUCXX) option(EIGEN_COVERAGE_TESTING "Enable/disable gcov" OFF) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 4a34ddef5..16d6d279f 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -247,7 +247,7 @@ endmacro(ei_add_test_internal_sycl) # # If EIGEN_SPLIT_LARGE_TESTS is ON, the test is split into multiple executables # test__ -# where N runs from 1 to the greatest occurence found in the source file. Each of these +# where N runs from 1 to the greatest occurrence found in the source file. Each of these # executables is built passing -DEIGEN_TEST_PART_N. This allows to split large tests # into smaller executables. # @@ -269,8 +269,8 @@ macro(ei_add_test testname) file(READ "${filename}" test_source) set(parts 0) string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" - occurences "${test_source}") - string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}") + occurrences "${test_source}") + string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") list(REMOVE_DUPLICATES suffixes) if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) add_custom_target(${testname}) @@ -303,8 +303,8 @@ macro(ei_add_test_sycl testname) file(READ "${filename}" test_source) set(parts 0) string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" - occurences "${test_source}") - string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}") + occurrences "${test_source}") + string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") list(REMOVE_DUPLICATES suffixes) if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) add_custom_target(${testname}) diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index e61dedc46..29f2a5007 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -243,7 +243,7 @@ endfunction() ####################### # # Adds a SYCL compilation custom command associated with an existing -# target and sets a dependancy on that new command. +# target and sets a dependency on that new command. # # targetName : Name of the target to add a SYCL to. # binaryDir : Intermediate directory to output the integration header. 
diff --git a/cmake/FindEigen3.cmake b/cmake/FindEigen3.cmake index 657440ba5..52efb4e15 100644 --- a/cmake/FindEigen3.cmake +++ b/cmake/FindEigen3.cmake @@ -15,7 +15,7 @@ # Eigen3::Eigen - The header-only Eigen library # # This module reads hints about search locations from -# the following enviroment variables: +# the following environment variables: # # EIGEN3_ROOT # EIGEN3_ROOT_DIR diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat index 07aa43739..35ef5807c 100644 --- a/debug/msvc/eigen_autoexp_part.dat +++ b/debug/msvc/eigen_autoexp_part.dat @@ -14,7 +14,7 @@ ; * - Eigen::Matrix<*,-1,+,*,*,*> ; * - Eigen::Matrix<*,+,+,*,*,*> ; * -; * Matrices are displayed properly independantly of the memory +; * Matrices are displayed properly independently of the memory ; * alignment (RowMajor vs. ColMajor). ; * ; * This file is distributed WITHOUT ANY WARRANTY. Please ensure diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 2109978fe..49b9fba39 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -1764,7 +1764,7 @@ UML_LOOK = YES # the class node. If there are many fields or methods and many nodes the # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS # threshold limits the number of items for each type to make the size more -# managable. Set this to 0 for no limit. Note that the threshold may be +# manageable. Set this to 0 for no limit. Note that the threshold may be # exceeded by 50% before the limit is enforced. UML_LIMIT_NUM_FIELDS = 10 diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox index 152dda47d..e054714f9 100644 --- a/doc/FunctionsTakingEigenTypes.dox +++ b/doc/FunctionsTakingEigenTypes.dox @@ -133,7 +133,7 @@ In this special case, the example is fine and will be working because both param \section TopicPlainFunctionsFailing In which cases do functions taking a plain Matrix or Array argument fail? -Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const paramter which allows us to store the result. A first naive implementation might look as follows. +Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const parameter which allows us to store the result. A first naive implementation might look as follows. \code // Note: This code is flawed! void cov(const MatrixXf& x, const MatrixXf& y, MatrixXf& C) @@ -176,7 +176,7 @@ The implementation above does now not only work with temporary expressions but i \section TopicResizingInGenericImplementations How to resize matrices in generic implementations? -One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the follwing code to work +One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the following code to work \code MatrixXf x = MatrixXf::Random(100,3); MatrixXf y = MatrixXf::Random(100,3); diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index b6d08c700..b49f7d3cf 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox @@ -51,7 +51,7 @@ are doing. 
\section TopicPreprocessorDirectivesCppVersion C++ standard features -By default, %Eigen strive to automatically detect and enable langage features at compile-time based on +By default, %Eigen strive to automatically detect and enable language features at compile-time based on the information provided by the compiler. - \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER. diff --git a/doc/QuickStartGuide.dox b/doc/QuickStartGuide.dox index ea32c3b3d..23bb2981b 100644 --- a/doc/QuickStartGuide.dox +++ b/doc/QuickStartGuide.dox @@ -68,7 +68,7 @@ The output is as follows: The second example starts by declaring a 3-by-3 matrix \c m which is initialized using the \link DenseBase::Random(Index,Index) Random() \endlink method with random values between -1 and 1. The next line applies a linear mapping such that the values are between 10 and 110. The function call \link DenseBase::Constant(Index,Index,const Scalar&) MatrixXd::Constant\endlink(3,3,1.2) returns a 3-by-3 matrix expression having all coefficients equal to 1.2. The rest is standard arithmetics. -The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left unitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows: +The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left uninitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows: \f[ v = diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox index a25622e80..81a73eec2 100644 --- a/doc/SparseQuickReference.dox +++ b/doc/SparseQuickReference.dox @@ -80,7 +80,7 @@ sm1.setZero(); \section SparseBasicInfos Matrix properties -Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some informations from the matrix. +Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some information from the matrix. + + + + + + + + + + + + diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox index cb92ceeae..a72724143 100644 --- a/doc/TutorialLinearAlgebra.dox +++ b/doc/TutorialLinearAlgebra.dox @@ -73,7 +73,7 @@ depending on your matrix and the trade-off you want to make: - + @@ -85,6 +85,14 @@ depending on your matrix and the trade-off you want to make: + + + + + + + + @@ -101,15 +109,24 @@ depending on your matrix and the trade-off you want to make: + + + + + + + + - +
\code diff --git a/doc/TemplateKeyword.dox b/doc/TemplateKeyword.dox index b84cfdae9..fbf2c7081 100644 --- a/doc/TemplateKeyword.dox +++ b/doc/TemplateKeyword.dox @@ -76,7 +76,7 @@ point where the template is defined, without knowing the actual value of the tem and \c Derived2 in the example). That means that the compiler cannot know that dst.triangularView is a member template and that the following < symbol is part of the delimiter for the template parameter. Another possibility would be that dst.triangularView is a member variable with the < -symbol refering to the operator<() function. In fact, the compiler should choose the second +symbol referring to the operator<() function. In fact, the compiler should choose the second possibility, according to the standard. If dst.triangularView is a member template (as in our case), the programmer should specify this explicitly with the \c template keyword and write dst.template triangularView. diff --git a/doc/TopicLazyEvaluation.dox b/doc/TopicLazyEvaluation.dox index 101ef8c72..b7820e3e6 100644 --- a/doc/TopicLazyEvaluation.dox +++ b/doc/TopicLazyEvaluation.dox @@ -58,7 +58,7 @@ the product matrix3 * matrix4 gets evaluated immediately into a tempora \code matrix1 = matrix2 * (matrix3 + matrix4); \endcode -Here, provided the matrices have at least 2 rows and 2 columns, each coefficienct of the expression matrix3 + matrix4 is going to be used several times in the matrix product. Instead of computing the sum everytime, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates matrix3 + matrix4 into a temporary variable before evaluating the product. +Here, provided the matrices have at least 2 rows and 2 columns, each coefficienct of the expression matrix3 + matrix4 is going to be used several times in the matrix product. Instead of computing the sum every time, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates matrix3 + matrix4 into a temporary variable before evaluating the product. */ diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 491470627..991f964cc 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -248,7 +248,7 @@ To get an overview of the true relative speed of the different decomposition, ch
Blocking
Means the algorithm can work per block, whence guaranteeing a good scaling of the performance for large matrices.
Implicit Multi Threading (MT)
-Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product rountines.
+Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product routines.
Explicit Multi Threading (MT)
Means the algorithm is explicitly parallelized to take advantage of multicore processors via OpenMP.
Meta-unroller
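A minimal sketch of the implicit multithreading discussed above, assuming Eigen was built with OpenMP enabled (the matrix sizes are arbitrary; only initParallel(), setNbThreads() and nbThreads() are real Eigen API):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::initParallel();          // call once before multithreaded use
  Eigen::setNbThreads(4);         // cap Eigen's own parallelization
  std::cout << Eigen::nbThreads() << " threads\n";
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(1024, 1024);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(1024, 1024);
  Eigen::MatrixXd C = A * B;      // the matrix-matrix product routine may run in parallel
  std::cout << C.norm() << "\n";
  return 0;
}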
diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox index 47c9b261f..bc394f484 100644 --- a/doc/TopicMultithreading.dox +++ b/doc/TopicMultithreading.dox @@ -47,7 +47,7 @@ int main(int argc, char** argv) \warning note that all functions generating random matrices are \b not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to Eigen::initParallel(). This is because these functions are based on std::rand which is not re-entrant. For thread-safe random generator, we recommend the use of boost::random or c++11 random feature. -In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallization as detailed in the previous section. +In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallelization as detailed in the previous section. */ diff --git a/doc/TutorialMapClass.dox b/doc/TutorialMapClass.dox index f8fb0fd2f..caa2539d8 100644 --- a/doc/TutorialMapClass.dox +++ b/doc/TutorialMapClass.dox @@ -29,9 +29,9 @@ Map mi(pi); \endcode where \c pi is an \c int \c *. In this case the size does not have to be passed to the constructor, because it is already specified by the Matrix/Array type. -Note that Map does not have a default constructor; you \em must pass a pointer to intialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew). +Note that Map does not have a default constructor; you \em must pass a pointer to initialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew). -Map is flexible enough to accomodate a variety of different data representations. There are two other (optional) template parameters: +Map is flexible enough to accommodate a variety of different data representations. There are two other (optional) template parameters: \code Map mat(rows,cols); // default is column major 2: mat.reserve(VectorXi::Constant(cols,6)); diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox index 0f7022973..8676faa1b 100644 --- a/doc/UnalignedArrayAssert.dox +++ b/doc/UnalignedArrayAssert.dox @@ -117,8 +117,8 @@ It doesn't disable 16-byte alignment, because that would mean that vectorized an \section checkmycode How can I check my code is safe regarding alignment issues? -Unfortunately, there is no possibility in C++ to detect any of the aformentioned shortcoming at compile time (though static analysers are becoming more and more powerful and could detect some of them). -Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the begining of this page. +Unfortunately, there is no possibility in C++ to detect any of the aforementioned shortcoming at compile time (though static analysers are becoming more and more powerful and could detect some of them). +Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the beginning of this page. Therefore, if your program runs fine on a given system with some given compilation flags, then this does not guarantee that your code is safe. For instance, on most 64 bits systems buffer are aligned on 16 bytes boundary and so, if you do not enable AVX instruction set, then your code will run fine. On the other hand, the same code may assert if moving to a more exotic platform, or enabling AVX instructions that required 32 bytes alignment by default. 
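A minimal sketch of the fixed-size member scenario behind these assertions (the struct Foo is hypothetical; EIGEN_MAKE_ALIGNED_OPERATOR_NEW is the documented Eigen macro):

#include <Eigen/Core>

struct Foo
{
  Eigen::Vector4d v;                // fixed-size vectorizable member
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW   // overloads operator new so that
                                    // `new Foo` returns suitably aligned memory
};

Foo* make_foo() { return new Foo; } // safe even when AVX requires 32-byte alignment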
The situation is not hopeless though. Assuming your code is well covered by unit test, then you can check its alignment safety by linking it to a custom malloc library returning 8 bytes aligned buffers only. This way all alignment shortcomings should pop-up. To this end, you must also compile your program with \link TopicPreprocessorDirectivesPerformance EIGEN_MALLOC_ALREADY_ALIGNED=0 \endlink. diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox index 9bcdf0bfc..36beb2ddd 100644 --- a/doc/UsingNVCC.dox +++ b/doc/UsingNVCC.dox @@ -5,7 +5,7 @@ namespace Eigen { Staring from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays for fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc most Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords making them callable from both host and device code. This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen's header. -This might be usefull to disable some warnings when a .cu file makes use of Eigen on the host side only. +This might be useful to disable some warnings when a .cu file makes use of Eigen on the host side only. However, in both cases, host's SIMD vectorization has to be disabled in .cu files. It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files. diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css index 6ce2b839b..b99d7914a 100644 --- a/doc/eigendoxy.css +++ b/doc/eigendoxy.css @@ -93,7 +93,7 @@ table th.inter { border-color: #cccccc; } -/** class for exemple / output tables **/ +/** class for example / output tables **/ table.example { } diff --git a/doc/special_examples/Tutorial_sparse_example.cpp b/doc/special_examples/Tutorial_sparse_example.cpp index 830e196ea..89937b411 100644 --- a/doc/special_examples/Tutorial_sparse_example.cpp +++ b/doc/special_examples/Tutorial_sparse_example.cpp @@ -12,7 +12,7 @@ int main(int argc, char** argv) assert(argc==2); int n = 300; // size of the image - int m = n*n; // number of unknows (=number of pixels) + int m = n*n; // number of unknowns (=number of pixels) // Assembly: std::vector coefficients; // list of non-zeros coefficients diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 9883d4c72..52f18edfc 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -35,7 +35,7 @@ set(EigenLapack_SRCS ${EigenLapack_SRCS} second_NONE.f dsecnd_NONE.f ) -option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enbale the Lapack unit tests") +option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests") if(EIGEN_ENABLE_LAPACK_TESTS) @@ -59,7 +59,7 @@ if(EIGEN_ENABLE_LAPACK_TESTS) message(STATUS "Setup lapack reference and lapack unit tests") execute_process(COMMAND tar xzf "lapack_addons_3.4.1.tgz" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) else() - message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests wont be enabled") + message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests won't be enabled") set(EIGEN_ENABLE_LAPACK_TESTS false) endif() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8bd086ce3..8bcf3f7c5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,5 @@ # generate split test header file only if it does not yet exist -# in order to prevent a rebuild everytime cmake is configured +# in order to prevent a rebuild every time 
cmake is configured if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "") foreach(i RANGE 1 999) diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index 6c7b09696..109218766 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -104,7 +104,7 @@ void test_bdcsvd() CALL_SUBTEST_7( BDCSVD(10,10) ); // Check that preallocation avoids subsequent mallocs - // Disbaled because not supported by BDCSVD + // Disabled because not supported by BDCSVD // CALL_SUBTEST_9( svd_preallocate() ); CALL_SUBTEST_2( svd_underoverflow() ); diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp index 293b1b265..03d5774ef 100644 --- a/test/eigensolver_complex.cpp +++ b/test/eigensolver_complex.cpp @@ -47,7 +47,7 @@ template bool find_pivot(typename MatrixType::Scalar tol, M return false; } -/* Check that two column vectors are approximately equal upto permutations. +/* Check that two column vectors are approximately equal up to permutations. * Initially, this method checked that the k-th power sums are equal for all k = 1, ..., vec1.rows(), * however this strategy is numerically inacurate because of numerical cancellation issues. */ diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp index 8ee8fdb27..5854d39c5 100644 --- a/test/geo_quaternion.cpp +++ b/test/geo_quaternion.cpp @@ -241,7 +241,7 @@ template void mapQuaternion(void){ const MQuaternionUA& cmq3(mq3); VERIFY( &cmq3.x() == &mq3.x() ); // FIXME the following should be ok. The problem is that currently the LValueBit flag - // is used to determine wether we can return a coeff by reference or not, which is not enough for Map. + // is used to determine whether we can return a coeff by reference or not, which is not enough for Map. //const MCQuaternionUA& cmcq3(mcq3); //VERIFY( &cmcq3.x() == &mcq3.x() ); } diff --git a/test/main.h b/test/main.h index 6079cbd06..14f4e3e7a 100644 --- a/test/main.h +++ b/test/main.h @@ -183,7 +183,7 @@ namespace Eigen }; } // If EIGEN_DEBUG_ASSERTS is defined and if no assertion is triggered while - // one should have been, then the list of excecuted assertions is printed out. + // one should have been, then the list of executed assertions is printed out. // // EIGEN_DEBUG_ASSERTS is not enabled by default as it // significantly increases the compilation time diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 08b360340..3c11df7e8 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -28,7 +28,7 @@ template T negate(const T& x) { return -x; } } } -// NOTE: we disbale inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU. +// NOTE: we disable inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU. 
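A minimal sketch of the i387 workaround mentioned in that NOTE (the helper names are hypothetical; only the EIGEN_DONT_INLINE macro is Eigen's): with -O3 on the x87 FPU, intermediates may live in 80-bit registers, so forcing a non-inlined call boundary makes both operands go through the same 64-bit rounding before comparison.

#include <Eigen/Core>
#include <cmath>

EIGEN_DONT_INLINE double force_rounding(double x) { return x; } // defeats register-width excess precision

bool approx_abs_sketch(double a, double b, double eps)
{
  return std::abs(force_rounding(a) - force_rounding(b)) <= eps;
}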
template EIGEN_DONT_INLINE bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits::Real& refvalue) { From baf9a5a776b45a2d9f330afb4c716db16c2a3fb8 Mon Sep 17 00:00:00 2001 From: vhuber Date: Fri, 30 Mar 2018 18:53:34 +0200 Subject: [PATCH 265/680] Add interface to umfpack_*l_* functions --- Eigen/src/UmfPackSupport/UmfPackSupport.h | 186 ++++++++++++++++++---- 1 file changed, 156 insertions(+), 30 deletions(-) diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index 9568cc1d5..fa62636e9 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -12,47 +12,92 @@ namespace Eigen { +#define UF_long __int64 + /* TODO extract L, extract U, compute det, etc... */ // generic double/complex wrapper functions: -inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) + // Defaults +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int) { umfpack_di_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_defaults(control); } -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, UF_long) +{ umfpack_dl_defaults(control); } + +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, UF_long) +{ umfpack_zl_defaults(control); } + +// Report info +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int) { umfpack_di_report_info(control, info);} -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, int) { umfpack_zi_report_info(control, info);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, UF_long) +{ umfpack_dl_report_info(control, info);} + +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, UF_long) +{ umfpack_zl_report_info(control, info);} + +// Report status +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int) { umfpack_di_report_status(control, status);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, int) { umfpack_zi_report_status(control, status);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], double) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, UF_long) +{ umfpack_dl_report_status(control, status);} + +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, UF_long) +{ umfpack_zl_report_status(control, status);} + +// report control +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int) { umfpack_di_report_control(control);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_report_control(control);} -inline void umfpack_free_numeric(void 
**Numeric, double) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, UF_long) +{ umfpack_dl_report_control(control);} + +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, UF_long) +{ umfpack_zl_report_control(control);} + +// Free numeric +inline void umfpack_free_numeric(void **Numeric, double, int) { umfpack_di_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_numeric(void **Numeric, std::complex) +inline void umfpack_free_numeric(void **Numeric, std::complex, int) { umfpack_zi_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_symbolic(void **Symbolic, double) +inline void umfpack_free_numeric(void **Numeric, double, UF_long) +{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; } + +inline void umfpack_free_numeric(void **Numeric, std::complex, UF_long) +{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; } + +// Free symbolic +inline void umfpack_free_symbolic(void **Symbolic, double, int) { umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; } -inline void umfpack_free_symbolic(void **Symbolic, std::complex) +inline void umfpack_free_symbolic(void **Symbolic, std::complex, int) { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; } +inline void umfpack_free_symbolic(void **Symbolic, double, UF_long) +{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; } + +inline void umfpack_free_symbolic(void **Symbolic, std::complex, UF_long) +{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; } + +// Symbolic inline int umfpack_symbolic(int n_row,int n_col, const int Ap[], const int Ai[], const double Ax[], void **Symbolic, const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) @@ -66,7 +111,21 @@ inline int umfpack_symbolic(int n_row,int n_col, { return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); } +inline int umfpack_symbolic(UF_long n_row,UF_long n_col, + const UF_long Ap[], const UF_long Ai[], const double Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +{ + return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info); +} +inline int umfpack_symbolic(UF_long n_row,UF_long n_col, + const UF_long Ap[], const UF_long Ai[], const std::complex Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +{ + return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); +} + +// Numeric inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[], void *Symbolic, void **Numeric, const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) @@ -80,7 +139,21 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex Ax[], + void *Symbolic, void **Numeric, + const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) +{ + return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info); +} + +// solve inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[], double X[], const double B[], void *Numeric, const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) @@ -95,6 +168,21 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); } +inline int umfpack_solve( int sys, const UF_long Ap[], const UF_long Ai[], const double Ax[], + double X[], const double B[], void *Numeric, 
+ const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +{ + return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info); +} + +inline int umfpack_solve( int sys, const UF_long Ap[], const UF_long Ai[], const std::complex Ax[], + std::complex X[], const std::complex B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +{ + return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); +} + +// Get Lunz inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double) { return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); @@ -105,6 +193,17 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_ return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); } +inline int umfpack_get_lunz(UF_long *lnz, UF_long *unz, UF_long *n_row, UF_long *n_col, UF_long *nz_udiag, void *Numeric, double) +{ + return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); +} + +inline int umfpack_get_lunz(UF_long *lnz, UF_long *unz, UF_long *n_row, UF_long *n_col, UF_long *nz_udiag, void *Numeric, std::complex) +{ + return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); +} + +// Get Numeric inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[], int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric) { @@ -120,18 +219,45 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex Lx[], in return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, Dx?&dx0_real:0,0,do_recip,Rs,Numeric); } +inline int umfpack_get_numeric(UF_long Lp[], UF_long Lj[], double Lx[], UF_long Up[], UF_long Ui[], double Ux[], + UF_long P[], UF_long Q[], double Dx[], UF_long *do_recip, double Rs[], void *Numeric) +{ + return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric); +} -inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) +inline int umfpack_get_numeric(UF_long Lp[], UF_long Lj[], std::complex Lx[], UF_long Up[], UF_long Ui[], std::complex Ux[], + UF_long P[], UF_long Q[], std::complex Dx[], UF_long *do_recip, double Rs[], void *Numeric) +{ + double& lx0_real = numext::real_ref(Lx[0]); + double& ux0_real = numext::real_ref(Ux[0]); + double& dx0_real = numext::real_ref(Dx[0]); + return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, + Dx?&dx0_real:0,0,do_recip,Rs,Numeric); +} + +// Get Determinant +inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) { return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info); } -inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) +inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) { double& mx_real = numext::real_ref(*Mx); return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); } +inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], UF_long) +{ + return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info); +} + +inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], UF_long) +{ + double& mx_real = numext::real_ref(*Mx); + return 
umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); +} + /** \ingroup UmfPackSupport_Module * \brief A sparse LU factorization and solver based on UmfPack @@ -164,7 +290,7 @@ class UmfPackLU : public SparseSolverBase > typedef Matrix IntRowVectorType; typedef Matrix IntColVectorType; typedef SparseMatrix LUMatrixType; - typedef SparseMatrix UmfpackMatrixType; + typedef SparseMatrix UmfpackMatrixType; typedef Ref UmfpackMatrixRef; enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -192,8 +318,8 @@ class UmfPackLU : public SparseSolverBase > ~UmfPackLU() { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex()); } inline Index rows() const { return mp_matrix.rows(); } @@ -241,8 +367,8 @@ class UmfPackLU : public SparseSolverBase > template void compute(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); analyzePattern_impl(); factorize_impl(); @@ -257,8 +383,8 @@ class UmfPackLU : public SparseSolverBase > template void analyzePattern(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); @@ -309,7 +435,7 @@ class UmfPackLU : public SparseSolverBase > { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); if(m_numeric) - umfpack_free_numeric(&m_numeric,Scalar()); + umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); @@ -322,7 +448,7 @@ class UmfPackLU : public SparseSolverBase > */ void printUmfpackControl() { - umfpack_report_control(m_control.data(), Scalar()); + umfpack_report_control(m_control.data(), Scalar(),StorageIndex()); } /** Prints statistics collected by UmfPack. @@ -332,7 +458,7 @@ class UmfPackLU : public SparseSolverBase > void printUmfpackInfo() { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); - umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar()); + umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex()); } /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization). 
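A minimal sketch of the tag-dispatch pattern these wrappers rely on (toy functions, not the real UMFPACK API): the trailing default-constructed Scalar()/StorageIndex() arguments carry no value, they exist only to steer overload resolution to the matching int- or long-indexed backend.

inline void backend_int_impl()  {}   // stand-in for an umfpack_*i_* routine
inline void backend_long_impl() {}   // stand-in for an umfpack_*l_* routine

inline void free_numeric_like(double, int)  { backend_int_impl();  }
inline void free_numeric_like(double, long) { backend_long_impl(); }

template<typename Scalar, typename StorageIndex>
void solver_cleanup()
{
  // the default-constructed tags carry no data; they only pick the overload
  free_numeric_like(Scalar(), StorageIndex());
}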
@@ -341,7 +467,7 @@ class UmfPackLU : public SparseSolverBase > */ void printUmfpackStatus() { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); - umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar()); + umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex()); } /** \internal */ @@ -362,13 +488,13 @@ class UmfPackLU : public SparseSolverBase > m_symbolic = 0; m_extractedDataAreDirty = true; - umfpack_defaults(m_control.data(), Scalar()); + umfpack_defaults(m_control.data(), Scalar(),StorageIndex()); } void analyzePattern_impl() { - m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), - internal::convert_index(mp_matrix.cols()), + m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), + internal::convert_index(mp_matrix.cols()), mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), &m_symbolic, m_control.data(), m_umfpackInfo.data()); @@ -438,7 +564,7 @@ void UmfPackLU::extractData() const if (m_extractedDataAreDirty) { // get size of the data - int lnz, unz, rows, cols, nz_udiag; + StorageIndex lnz, unz, rows, cols, nz_udiag; umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); // allocate data @@ -464,7 +590,7 @@ template typename UmfPackLU::Scalar UmfPackLU::determinant() const { Scalar det; - umfpack_get_determinant(&det, 0, m_numeric, 0); + umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex()); return det; } From 267a144da5145d735934aab0a249418d39ce3f9b Mon Sep 17 00:00:00 2001 From: vhuber Date: Fri, 30 Mar 2018 23:04:53 +0200 Subject: [PATCH 266/680] Remove unnecessary define --- Eigen/src/UmfPackSupport/UmfPackSupport.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index fa62636e9..ca2f73dbe 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -12,8 +12,6 @@ namespace Eigen { -#define UF_long __int64 - /* TODO extract L, extract U, compute det, etc... */ // generic double/complex wrapper functions: From 524119d32a2ac354a384717da38526087a242eb0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 10:56:10 +0200 Subject: [PATCH 267/680] Fix uninitialized output argument. 
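A minimal sketch of the pitfall this commit fixes (a hypothetical reader, not MarketIO itself): every by-reference output must be written on every path, including early returns, so initialize them all before the first failure point.

#include <fstream>
#include <string>

bool read_header_sketch(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
{
  sym = 0;             // initialize every output first...
  iscomplex = false;   // ...so an early `return false` leaves no
  isvector = false;    //    garbage in the caller's variables
  std::ifstream in(filename.c_str(), std::ios::in);
  if (!in)
    return false;
  // ... parse the banner and set sym/iscomplex/isvector accordingly ...
  return true;
}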
--- unsupported/Eigen/src/SparseExtra/MarketIO.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h index fc70a24d8..6d57ab2e9 100644 --- a/unsupported/Eigen/src/SparseExtra/MarketIO.h +++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h @@ -109,6 +109,7 @@ namespace internal inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector) { sym = 0; + iscomplex = false; isvector = false; std::ifstream in(filename.c_str(),std::ios::in); if(!in) From 6719409cd92b19acabc4544f9ac5571a2ff9a88f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 14:11:56 +0200 Subject: [PATCH 268/680] AVX512: add missing pinsertfirst and pinsertlast, implement pblend for Packet8d, fix compilation without AVX512DQ --- Eigen/src/Core/arch/AVX512/PacketMath.h | 51 ++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 12b897572..abece01bc 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -54,6 +54,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 16, HasHalfPacket = 1, + HasBlend = 0, #if EIGEN_GNUC_AT_LEAST(5, 3) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, @@ -470,6 +471,8 @@ EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0)); return pairs; } + +#ifdef EIGEN_VECTORIZE_AVX512DQ // Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, // a3} template <> @@ -481,6 +484,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3); return x; } +#else +template <> +EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { + __m512d x = _mm512_setzero_pd(); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3)); + return x; +} +#endif // Loads 4 floats from memory a returns the packet // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} @@ -1272,11 +1286,38 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/, return Packet16f(); } template <> -EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/, - const Packet8d& /*thenPacket*/, - const Packet8d& /*elsePacket*/) { - assert(false && "To be implemented"); - return Packet8d(); +EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, + const Packet8d& thenPacket, + const Packet8d& elsePacket) { + __mmask8 m = (ifPacket.select[0] ) + | (ifPacket.select[1]<<1) + | (ifPacket.select[2]<<2) + | (ifPacket.select[3]<<3) + | (ifPacket.select[4]<<4) + | (ifPacket.select[5]<<5) + | (ifPacket.select[6]<<6) + | (ifPacket.select[7]<<7); + return _mm512_mask_blend_pd(m, elsePacket, thenPacket); +} + +template<> EIGEN_STRONG_INLINE Packet16f pinsertfirst(const Packet16f& a, float b) +{ + return _mm512_mask_broadcastss_ps(a, (1), _mm_load_ss(&b)); +} + +template<> EIGEN_STRONG_INLINE Packet8d pinsertfirst(const Packet8d& a, double b) +{ + return _mm512_mask_broadcastsd_pd(a, (1), _mm_load_sd(&b)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pinsertlast(const Packet16f& a, float b) +{ + return 
_mm512_mask_broadcastss_ps(a, (1<<15), _mm_load_ss(&b)); +} + +template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b) +{ + return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b)); } } // end namespace internal From 7b0630315f343422b37f62f40a039c9e725fe9e1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 14:12:50 +0200 Subject: [PATCH 269/680] AVX512: fix psqrt and prsqrt --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 399be0ee4..4695fbc81 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -266,8 +266,7 @@ psqrt(const Packet16f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); - Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x), - _mm512_setzero_ps()); + Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x)); // Do a single step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); @@ -289,8 +288,7 @@ psqrt(const Packet8d& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); - Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x), - _mm512_setzero_pd()); + Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x)); // Do a first step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); @@ -333,20 +331,18 @@ prsqrt(const Packet16f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); - Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), - _mm512_rsqrt14_ps(_x)); + Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps()); // Fill in NaNs and Infs for the negative/zero entries. __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); Packet16f infs_and_nans = _mm512_mask_blend_ps( - neg_mask, p16f_nan, - _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps())); + neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan); // Do a single step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); + return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans); } template <> @@ -363,14 +359,12 @@ prsqrt(const Packet8d& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); - Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), - _mm512_rsqrt14_pd(_x)); + Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd()); // Fill in NaNs and Infs for the negative/zero entries. 
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); Packet8d infs_and_nans = _mm512_mask_blend_pd( - neg_mask, p8d_nan, - _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd())); + neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan); // Do a first step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); @@ -379,7 +373,7 @@ prsqrt(const Packet8d& _x) { x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); + return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); } #else template <> From d43b2f01f48d67507b921b8f9ee1fca071ac2ee2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 14:14:00 +0200 Subject: [PATCH 270/680] Fix unit testing of predux_downto4 (bad name), and add unit testing of prsqrt --- test/packetmath.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 3c11df7e8..31a524af3 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -248,12 +248,13 @@ template void packetmath() VERIFY(isApproxAbs(ref[0], internal::predux(internal::pload(data1)), refvalue) && "internal::predux"); { - for (int i=0; i<4; ++i) + int HalfPacketSize = PacketSize>4 ? PacketSize/2 : PacketSize; + for (int i=0; i(data1))); - VERIFY(areApprox(ref, data2, PacketSize>4?PacketSize/2:PacketSize) && "internal::predux_downto4"); + VERIFY(areApprox(ref, data2, HalfPacketSize) && "internal::predux_downto4"); } ref[0] = 1; @@ -436,6 +437,7 @@ template void packetmath_real() if(internal::random(0,1)<0.1f) data1[internal::random(0, PacketSize)] = 0; CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); + CHECK_CWISE1_IF(PacketTraits::HasSqrt, Scalar(1)/std::sqrt, internal::prsqrt); CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); #if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L) CHECK_CWISE1_IF(PacketTraits::HasExpm1, std::expm1, internal::pexpm1); From 67bac6368cc6cdf6c1f420c53cff73878b3d86db Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 14:19:04 +0200 Subject: [PATCH 271/680] protect calls to isnan --- unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 7deca3e12..97b2e5a91 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -535,7 +535,7 @@ struct igammac_impl { return nan; } - if (numext::isnan(a) || numext::isnan(x)) { // propagate nans + if ((numext::isnan)(a) || (numext::isnan)(x)) { // propagate nans return nan; } @@ -728,7 +728,7 @@ struct igamma_impl { return nan; } - if (numext::isnan(a) || numext::isnan(x)) { // propagate nans + if ((numext::isnan)(a) || (numext::isnan)(x)) { // propagate nans return nan; } From 584951ca4d3f822470895370c2de087baa5ca751 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 14:28:38 +0200 Subject: [PATCH 272/680] Rename predux_downto4 to be more accurate on its semantic. 
--- Eigen/src/Core/GenericPacketMath.h | 4 ++-- Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- Eigen/src/Core/arch/AVX512/PacketMath.h | 4 ++-- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- test/packetmath.cpp | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 30878eda6..7979c3aff 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -324,13 +324,13 @@ preduxp(const Packet* vecs) { return vecs[0]; } template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) { return a; } -/** \internal \returns the sum of the elements of \a a by block of 4 elements. +/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4. * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} * For packet-size smaller or equal to 4, this boils down to a noop. */ template EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type -predux_downto4(const Packet& a) +predux_half_dowto4(const Packet& a) { return a; } /** \internal \returns the product of the elements of \a a*/ diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 636230944..af31023f0 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -412,7 +412,7 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); } -template<> EIGEN_STRONG_INLINE Packet4f predux_downto4(const Packet8f& a) +template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); } diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index abece01bc..eb5de43d4 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -888,7 +888,7 @@ EIGEN_STRONG_INLINE double predux(const Packet8d& a) { } template <> -EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { +EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ __m256 lane0 = _mm512_extractf32x8_ps(a, 0); __m256 lane1 = _mm512_extractf32x8_ps(a, 1); @@ -904,7 +904,7 @@ EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { #endif } template <> -EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { +EIGEN_STRONG_INLINE Packet4d predux_half_dowto4(const Packet8d& a) { __m256d lane0 = _mm512_extractf64x4_pd(a, 0); __m256d lane1 = _mm512_extractf64x4_pd(a, 1); __m256d res = _mm256_add_pd(lane0, lane1); diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 9072d0ff3..dd4f366ee 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -580,7 +580,7 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket -const DoublePacket& predux_downto4(const DoublePacket &a) +const DoublePacket& predux_half_dowto4(const DoublePacket &a) { return a; } @@ -1596,13 +1596,13 @@ void gebp_kernel void packetmath() ref[i] = 0; for (int i=0; i(data1))); - VERIFY(areApprox(ref, data2, HalfPacketSize) && "internal::predux_downto4"); + internal::pstore(data2, 
internal::predux_half_dowto4(internal::pload(data1))); + VERIFY(areApprox(ref, data2, HalfPacketSize) && "internal::predux_half_dowto4"); } ref[0] = 1; From 40b4bf3d32265152ae26c2f630f6f70e5d676f3d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 14:36:27 +0200 Subject: [PATCH 273/680] AVX512: _mm512_rsqrt28_ps is available for AVX512ER only --- Eigen/Core | 3 +++ Eigen/src/Core/arch/AVX512/MathFunctions.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index a9bbfe276..c74340b81 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -183,6 +183,9 @@ #ifdef __AVX512DQ__ #define EIGEN_VECTORIZE_AVX512DQ #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif #endif // include files diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 4695fbc81..61434a33e 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -375,7 +375,7 @@ prsqrt(const Packet8d& _x) { // Insert NaNs and Infs in all the right places. return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); } -#else +#elif defined(EIGEN_VECTORIZE_AVX512ER) template <> EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { return _mm512_rsqrt28_ps(x); From 407e3e2621077a6cd768042c88c652ad75a085ae Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 15:59:30 +0200 Subject: [PATCH 274/680] bug #1532: disable stl::*_negate in C++17 (they are deprecated) --- Eigen/src/Core/functors/StlFunctors.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h index 6df3fa501..9c1d75850 100644 --- a/Eigen/src/Core/functors/StlFunctors.h +++ b/Eigen/src/Core/functors/StlFunctors.h @@ -83,13 +83,17 @@ struct functor_traits > { enum { Cost = functor_traits::Cost, PacketAccess = false }; }; #endif +#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910) +// std::unary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +// std::binary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +#endif #ifdef EIGEN_STDEXT_SUPPORT From 8d0ffe36552aeeb5f46d9c652edc45c68e536cdd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 16:15:43 +0200 Subject: [PATCH 275/680] bug #1516: add assertion for out-of-range diagonal index in MatrixBase::diagonal(i) --- Eigen/src/Core/Diagonal.h | 5 ++++- test/diagonal.cpp | 9 +++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index c62f5ff21..563135fb2 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -70,7 +70,10 @@ template class Diagonal EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal) EIGEN_DEVICE_FUNC - explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {} + explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) + { + eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() ); + } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal) diff --git a/test/diagonal.cpp b/test/diagonal.cpp index c1546e97d..0b5ae8280 100644 --- a/test/diagonal.cpp +++ b/test/diagonal.cpp @@ -66,6 +66,9 @@ template void diagonal(const 
MatrixType& m)
     m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1;
     VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1);
   }
+
+  VERIFY( m1.diagonal( cols).size()==0 );
+  VERIFY( m1.diagonal(-rows).size()==0 );
 }

 template<typename MatrixType> void diagonal_assert(const MatrixType& m) {
   VERIFY_RAISES_ASSERT( m1.array() *= m1.diagonal().array() );
   VERIFY_RAISES_ASSERT( m1.array() /= m1.diagonal().array() );
   }
+
+  VERIFY_RAISES_ASSERT( m1.diagonal(cols+1) );
+  VERIFY_RAISES_ASSERT( m1.diagonal(-(rows+1)) );
 }

 void test_diagonal()
 {
     CALL_SUBTEST_2( diagonal(MatrixXcd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_1( diagonal(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_1( diagonal(Matrix(3, 4)) );
+    CALL_SUBTEST_1( diagonal_assert(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
   }
-
-  CALL_SUBTEST_1( diagonal_assert(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
 }

From c5b56f1fb27fb5b85eefef6b93dd71f4edb400db Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 3 Apr 2018 16:49:35 +0200
Subject: [PATCH 276/680] bug #1528: better use numeric_limits::min() instead
 of 1/highest(), which may underflow.

---
 Eigen/src/Cholesky/LDLT.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h
index 5be58377b..2dfeac333 100644
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@@ -569,13 +569,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
   // more precisely, use pseudo-inverse of D (see bug 241)
   using std::abs;
   const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
-  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
-  // as motivated by LAPACK's xGELSS:
+  // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+  // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
   // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
   // However, LDLT is not rank revealing, and so adjusting the tolerance w.r.t. the highest
   // diagonal element is not well justified and leads to numerical issues in some cases.
   // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
+  // Using numeric_limits::min() gives us more robustness to denormals.
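+  // (Editorial illustration, not part of the original patch: for double, 1/highest() is
+  // about 5.6e-309, which is already a denormal, whereas numeric_limits<double>::min()
+  // = 2.2e-308 is the smallest *normalized* positive value, hence the extra robustness.)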
+ RealScalar tolerance = (std::numeric_limits::min)(); for (Index i = 0; i < vecD.size(); ++i) { From dd4cc6bd9e60d1ab2cef4c66ee922d3d84f3d0b2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 17:11:15 +0200 Subject: [PATCH 277/680] bug #1527: fix support for MKL's VML (destination was not properly resized) --- Eigen/src/Core/Assign_MKL.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h index 6c2ab9264..6866095bf 100755 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h @@ -84,7 +84,8 @@ class vml_assign_traits struct Assignment, SrcXprNested>, assign_op, \ Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseUnaryOp, SrcXprNested> SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ if(vml_assign_traits::Traversal==LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ @@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseBinaryOp, SrcXprNested, \ const CwiseNullaryOp,Plain> > SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ VMLTYPE exponent = reinterpret_cast(src.rhs().functor().m_other); \ if(vml_assign_traits::Traversal==LinearTraversal) \ From 8c7b5158a10af19bb98f2c2b5c57d44fd8899fe8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 17:15:38 +0200 Subject: [PATCH 278/680] commit 45e9c9996da790b55ed9c4b0dfeae49492ac5c46 (HEAD -> memory_fix) Author: George Burgess IV Date: Thu Mar 1 11:20:24 2018 -0800 Prefer `::operator new` to `new` The C++ standard allows compilers much flexibility with `new` expressions, including eliding them entirely (https://godbolt.org/g/yS6i91). However, calls to `operator new` are required to be treated like opaque function calls. Since we're calling `new` for side-effects other than allocating heap memory, we should prefer the less flexible version. 
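For illustration, here is the pattern at stake (editor's sketch, not part of the patch; `force_bad_alloc` is a hypothetical name):

    #include <cstddef>
    #include <new>

    // We allocate purely for the side effect of throwing std::bad_alloc.
    void force_bad_alloc()
    {
      std::size_t huge = static_cast<std::size_t>(-1);
      // A `new` *expression* whose result is unused may be elided entirely,
      // in which case nothing would throw:
      //   new int[huge];
      // A direct call to the allocation function is an opaque call that the
      // compiler must keep, so the exception is guaranteed:
      ::operator new(huge);
    }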
Signed-off-by: George Burgess IV --- Eigen/src/Core/util/Memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 006b0bfba..53300c388 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -70,7 +70,7 @@ inline void throw_std_bad_alloc() throw std::bad_alloc(); #else std::size_t huge = static_cast(-1); - new int[huge]; + ::operator new(huge); #endif } From a1292395d6feefa5d78ed9101c2c85648ba8eea9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 23:06:44 +0200 Subject: [PATCH 279/680] Fix compilation of product with inverse transpositions (e.g., mat * Transpositions().inverse()) --- Eigen/src/Core/Transpositions.h | 2 +- test/permutationmatrices.cpp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 8798deca5..81a4a5855 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -384,7 +384,7 @@ class Transpose > const Product operator*(const MatrixBase& matrix, const Transpose& trt) { - return Product(matrix.derived(), trt.derived()); + return Product(matrix.derived(), trt); } /** \returns the \a matrix with the inverse transpositions applied to the rows. diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp index db1266579..0ed7a4b2b 100644 --- a/test/permutationmatrices.cpp +++ b/test/permutationmatrices.cpp @@ -19,9 +19,11 @@ template void permutationmatrices(const MatrixType& m) enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime, Options = MatrixType::Options }; typedef PermutationMatrix LeftPermutationType; + typedef Transpositions LeftTranspositionsType; typedef Matrix LeftPermutationVectorType; typedef Map MapLeftPerm; typedef PermutationMatrix RightPermutationType; + typedef Transpositions RightTranspositionsType; typedef Matrix RightPermutationVectorType; typedef Map MapRightPerm; @@ -35,6 +37,8 @@ template void permutationmatrices(const MatrixType& m) RightPermutationVectorType rv; randomPermutationVector(rv, cols); RightPermutationType rp(rv); + LeftTranspositionsType lt(lv); + RightTranspositionsType rt(rv); MatrixType m_permuted = MatrixType::Random(rows,cols); VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, 1); // 1 temp for sub expression "lp * m_original" @@ -115,6 +119,14 @@ template void permutationmatrices(const MatrixType& m) Matrix B = rp.transpose(); VERIFY_IS_APPROX(A, B.transpose()); } + + m_permuted = m_original; + lp = lt; + rp = rt; + VERIFY_EVALUATION_COUNT(m_permuted = lt * m_permuted * rt, 1); + VERIFY_IS_APPROX(m_permuted, lp*m_original*rp); + + VERIFY_IS_APPROX(lt.inverse()*m_permuted*rt.inverse(), m_original); } template From 112c8993049ea0c4ef8e3e4f257880626f8b39e1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Apr 2018 23:16:43 +0200 Subject: [PATCH 280/680] comment unreachable code --- Eigen/src/Core/arch/AVX/PacketMath.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index af31023f0..34b89c790 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -343,9 +343,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) { __m256d tmp = _mm256_shuffle_pd(a,a,5); return _mm256_permute2f128_pd(tmp, tmp, 1); - + #if 0 + // This version is unlikely to be faster as _mm256_shuffle_ps 
and _mm256_permute_pd
+  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
+    __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
+    return _mm256_permute_pd(swap_halves,5);
+  #endif
 }

 // pabs should be ok

From e91e314347c14774206307a91d1b427e49f9b3e2 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 4 Apr 2018 11:39:19 +0200
Subject: [PATCH 281/680] bug #1494: makes pmin/pmax behave on Altivec/VSX as
 on x86 regarding NaNs

---
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 36 +++++++++++++++++++++---
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 31bb896ca..ad0c6df45 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -388,10 +388,28 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }

-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  #ifdef __VSX__
+  Packet4f ret;
+  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+  #else
+  return vec_min(a, b);
+  #endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }

-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  #ifdef __VSX__
+  Packet4f ret;
+  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+  #else
+  return vec_max(a, b);
+  #endif
+}
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }

 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@@ -910,9 +928,19 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
 // for some weird reasons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }

-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  Packet2d ret;
+  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+ }

-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  Packet2d ret;
+  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+}

 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }

From 13f5df9f6763f2dc900b74df06a3b1e67bacdaaf Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 4 Apr 2018 13:10:38 +0200
Subject: [PATCH 282/680] Add a note on vec_min vs asm

---
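Editor's note (illustrative, not part of the patch): the reference convention the new
comments cite is that of std::min/std::max, where any comparison involving NaN is
false, so the first argument is returned. A minimal standalone check:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main()
    {
      float nan = std::nanf("");
      // std::min(a,b) is defined as (b < a) ? b : a; NaN comparisons are false:
      std::printf("%f\n", std::min(nan, 1.f)); // prints nan (first argument)
      std::printf("%f\n", std::min(1.f, nan)); // prints 1.0 (first argument)
      return 0;
    }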
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index ad0c6df45..6d7190a56 100755
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -391,6 +391,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i&
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   #ifdef __VSX__
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet4f ret;
   __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -403,6 +404,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   #ifdef __VSX__
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet4f ret;
   __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -930,6 +932,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d&
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet2d ret;
   __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -937,6 +940,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet2d ret;
   __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;

From 73729025a4740d344bae98dfbae36a69405e217f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 4 Apr 2018 13:45:34 +0200
Subject: [PATCH 283/680] bug #1521: add unit test dedicated to numext::hypot

---
 test/stable_norm.cpp | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index 3c02474b8..e4b97a6da 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -190,13 +190,52 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
   }
 }

+template<typename Scalar>
+void test_hypot()
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Scalar factor = internal::random<Scalar>();
+  while(numext::abs2(factor)<RealScalar(1e-4))
+    factor = internal::random<Scalar>();
+  Scalar big = factor * ((std::numeric_limits<RealScalar>::max)() * RealScalar(1e-4));
+
+  factor = internal::random<Scalar>();
+  while(numext::abs2(factor)<RealScalar(1e-4))
+    factor = internal::random<Scalar>();
+  Scalar small = factor * ((std::numeric_limits<RealScalar>::min)() * RealScalar(1e4));
+
+  std::cout << big << " " << small << "\n";
+
+  Scalar one   (1),
+         zero  (0),
+         sqrt2 (std::sqrt(2)),
+         nan   (std::numeric_limits<RealScalar>::quiet_NaN());
+
+  Scalar a = internal::random<Scalar>(-1,1);
+  Scalar b = internal::random<Scalar>(-1,1);
+  VERIFY_IS_APPROX(numext::hypot(a,b),std::sqrt(numext::abs2(a)+numext::abs2(b)));
+  VERIFY_IS_EQUAL(numext::hypot(zero,zero), zero);
+  VERIFY_IS_APPROX(numext::hypot(one, one), sqrt2);
+  VERIFY_IS_APPROX(numext::hypot(big,big), sqrt2*numext::abs(big));
+  VERIFY_IS_APPROX(numext::hypot(small,small), sqrt2*numext::abs(small));
+  VERIFY_IS_APPROX(numext::hypot(small,big), numext::abs(big));
+  VERIFY((numext::isnan)(numext::hypot(nan,a)));
+  VERIFY((numext::isnan)(numext::hypot(a,nan)));
+}
+
 void test_stable_norm()
 {
for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_3( test_hypot() ); + CALL_SUBTEST_4( test_hypot() ); + CALL_SUBTEST_5( test_hypot >() ); + CALL_SUBTEST_6( test_hypot >() ); + CALL_SUBTEST_1( stable_norm(Matrix()) ); CALL_SUBTEST_2( stable_norm(Vector4d()) ); CALL_SUBTEST_3( stable_norm(VectorXd(internal::random(10,2000))) ); CALL_SUBTEST_4( stable_norm(VectorXf(internal::random(10,2000))) ); CALL_SUBTEST_5( stable_norm(VectorXcd(internal::random(10,2000))) ); + CALL_SUBTEST_6( stable_norm(VectorXcf(internal::random(10,2000))) ); } } From e116f6847e3cd9d16ea3be14c2b7efbd0a1c2b0b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Apr 2018 13:47:23 +0200 Subject: [PATCH 284/680] bug #1521: avoid signalling NaN in hypot and make it std::complex<> friendly. --- Eigen/src/Core/MathFunctions.h | 26 +------------------------- Eigen/src/Core/MathFunctionsImpl.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index e981129b2..84f9d0cd5 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -347,31 +347,7 @@ struct norm1_retval * Implementation of hypot * ****************************************************************************/ -template -struct hypot_impl -{ - typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) - { - EIGEN_USING_STD_MATH(abs); - EIGEN_USING_STD_MATH(sqrt); - RealScalar _x = abs(x); - RealScalar _y = abs(y); - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - if(p==RealScalar(0)) return RealScalar(0); - return p * sqrt(RealScalar(1) + qp*qp); - } -}; +template struct hypot_impl; template struct hypot_retval diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index ae1386b4c..034cfad7b 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -66,6 +66,25 @@ T generic_fast_tanh_float(const T& a_x) return pdiv(p, q); } + +template +struct hypot_impl +{ + typedef typename NumTraits::Real RealScalar; + static inline RealScalar run(const Scalar& x, const Scalar& y) + { + EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD_MATH(sqrt); + RealScalar _x = abs(x); + RealScalar _y = abs(y); + RealScalar p, qp; + p = numext::maxi(_x,_y); + if(p==RealScalar(0)) return RealScalar(0); + qp = numext::mini(_y,_x) / p; + return p * sqrt(RealScalar(1) + qp*qp); + } +}; + } // end namespace internal } // end namespace Eigen From 368dd4cd9d18cf904b2ce3c4de18020269dcbba1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Apr 2018 15:09:21 +0200 Subject: [PATCH 285/680] Make innerVector() and innerVectors() methods available to all expressions supported by Block. Before, only SparseBase exposed such methods. --- Eigen/src/SparseCore/SparseBlock.h | 40 ---------------------- Eigen/src/SparseCore/SparseMatrixBase.h | 12 ------- Eigen/src/plugins/BlockMethods.h | 44 +++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 52 deletions(-) diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 511e92b2f..5ed7c437b 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -326,46 +326,6 @@ private: //---------- -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). 
- */ -template -typename SparseMatrixBase::InnerVectorReturnType SparseMatrixBase::innerVector(Index outer) -{ return InnerVectorReturnType(derived(), outer); } - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). Read-only. - */ -template -const typename SparseMatrixBase::ConstInnerVectorReturnType SparseMatrixBase::innerVector(Index outer) const -{ return ConstInnerVectorReturnType(derived(), outer); } - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). - */ -template -typename SparseMatrixBase::InnerVectorsReturnType -SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). Read-only. - */ -template -const typename SparseMatrixBase::ConstInnerVectorsReturnType -SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) const -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - /** Generic implementation of sparse Block expression. * Real-only. */ diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index c6b548f11..dac2d8c62 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -350,18 +350,6 @@ template class SparseMatrixBase const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); } const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); } - // inner-vector - typedef Block InnerVectorReturnType; - typedef Block ConstInnerVectorReturnType; - InnerVectorReturnType innerVector(Index outer); - const ConstInnerVectorReturnType innerVector(Index outer) const; - - // set of inner-vectors - typedef Block InnerVectorsReturnType; - typedef Block ConstInnerVectorsReturnType; - InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize); - const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const; - DenseMatrixType toDense() const { return DenseMatrixType(derived()); diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index 5caf14469..5fcd13cc4 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -40,6 +40,14 @@ typedef const VectorBlock ConstSegmentReturnType; template struct FixedSegmentReturnType { typedef VectorBlock Type; }; template struct ConstFixedSegmentReturnType { typedef const VectorBlock Type; }; +/// \internal inner-vector +typedef Block InnerVectorReturnType; +typedef Block ConstInnerVectorReturnType; + +/// \internal set of inner-vectors +typedef Block InnerVectorsReturnType; +typedef Block ConstInnerVectorsReturnType; + #endif // not EIGEN_PARSED_BY_DOXYGEN /// \returns an expression of a block in \c *this with either dynamic or fixed sizes. @@ -1354,3 +1362,39 @@ inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). 
+///
+InnerVectorReturnType innerVector(Index outer)
+{ return InnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+const ConstInnerVectorReturnType innerVector(Index outer) const
+{ return ConstInnerVectorReturnType(derived(), outer); }
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+InnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize)
+{
+  return Block<Derived,Dynamic,Dynamic,true>(derived(),
+        IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+        IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+const ConstInnerVectorsReturnType
+innerVectors(Index outerStart, Index outerSize) const
+{
+  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
+        IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+        IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
\ No newline at end of file

From 4213b63f5ce33d3f904674ee7b0cabd6934dda6b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 4 Apr 2018 15:12:43 +0200
Subject: [PATCH 286/680] Factorize code between numext::hypot and
 scalar_hypot_op functor.

---
 Eigen/src/Core/MathFunctionsImpl.h       | 20 +++++++++++--------
 Eigen/src/Core/functors/BinaryFunctors.h | 25 +++++++++--------------
 2 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 034cfad7b..cc2ac0700 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -66,6 +66,17 @@ T generic_fast_tanh_float(const T& a_x)
   return pdiv(p, q);
 }

+template<typename RealScalar>
+EIGEN_STRONG_INLINE
+RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
+{
+  EIGEN_USING_STD_MATH(sqrt);
+  RealScalar p, qp;
+  p = numext::maxi(x,y);
+  if(p==RealScalar(0)) return RealScalar(0);
+  qp = numext::mini(y,x) / p;
+  return p * sqrt(RealScalar(1) + qp*qp);
+}

 template<typename Scalar>
 struct hypot_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
   static inline RealScalar run(const Scalar& x, const Scalar& y)
   {
     EIGEN_USING_STD_MATH(abs);
-    EIGEN_USING_STD_MATH(sqrt);
-    RealScalar _x = abs(x);
-    RealScalar _y = abs(y);
-    RealScalar p, qp;
-    p = numext::maxi(_x,_y);
-    if(p==RealScalar(0)) return RealScalar(0);
-    qp = numext::mini(_y,_x) / p;
-    return p * sqrt(RealScalar(1) + qp*qp);
+    return positive_real_hypot<RealScalar>(abs(x), abs(y));
   }
 };

diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h
index 96747bac7..3eae6b8ca 100644
--- a/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/Eigen/src/Core/functors/BinaryFunctors.h
@@ -255,7 +255,7 @@ struct scalar_cmp_op : binary_op_base
 struct scalar_hypot_op<Scalar,Scalar> : binary_op_base<Scalar,Scalar>
 {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-//   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const
   {
-    EIGEN_USING_STD_MATH(sqrt)
-    Scalar p, qp;
-    if(_x>_y)
-    {
-      p = _x;
-      qp = _y / p;
-    }
-    else
-    {
-      p = _y;
-      qp = _x / p;
-    }
-    return p * sqrt(Scalar(1) + qp*qp);
+    // This functor is used by hypotNorm only for which it is faster to first apply abs
+    // on all coefficients prior to reduction through hypot.
+ // This way we avoid calling abs on positive and real entries, and this also permits + // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes + // through the same functor... + return internal::positive_real_hypot(x,y); } }; template From 403f09ccef73d347e7d1a6966dfb04eb291cc8b5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Apr 2018 15:13:31 +0200 Subject: [PATCH 287/680] Make stableNorm and blueNorm compatible with 2D matrices. --- Eigen/src/Core/StableNorm.h | 115 +++++++++++++++++++++++++----------- test/stable_norm.cpp | 3 +- 2 files changed, 80 insertions(+), 38 deletions(-) diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index e0e7a0c8f..77ea3c261 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc ssq += (bl*invScale).squaredNorm(); } +template +void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) +{ + typedef typename VectorType::Scalar Scalar; + const Index blockSize = 4096; + + typedef typename internal::nested_eval::type VectorTypeCopy; + typedef typename internal::remove_all::type VectorTypeCopyClean; + const VectorTypeCopy copy(vec); + + enum { + CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization + }; + typedef typename internal::conditional, internal::evaluator::Alignment>, + typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper; + Index n = vec.size(); + + Index bi = internal::first_default_aligned(copy); + if (bi>0) + internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); + for (; bi +typename VectorType::RealScalar +stable_norm_impl(const VectorType &vec, typename enable_if::type* = 0 ) +{ + using std::sqrt; + using std::abs; + + Index n = vec.size(); + + if(n==1) + return abs(vec.coeff(0)); + + typedef typename VectorType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + stable_norm_impl_inner_step(vec, ssq, scale, invScale); + + return scale * sqrt(ssq); +} + +template +typename MatrixType::RealScalar +stable_norm_impl(const MatrixType &mat, typename enable_if::type* = 0 ) +{ + using std::sqrt; + + typedef typename MatrixType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + for(Index j=0; j inline typename NumTraits::Scalar>::Real blueNorm_impl(const EigenBase& _vec) @@ -98,12 +163,16 @@ blueNorm_impl(const EigenBase& _vec) RealScalar asml = RealScalar(0); RealScalar amed = RealScalar(0); RealScalar abig = RealScalar(0); - for(typename Derived::InnerIterator it(vec, 0); it; ++it) + + for(Index j=0; j ab2) abig += numext::abs2(ax*s2m); - else if(ax < b1) asml += numext::abs2(ax*s1m); - else amed += numext::abs2(ax); + for(typename Derived::InnerIterator it(vec, j); it; ++it) + { + RealScalar ax = abs(it.value()); + if(ax > ab2) abig += numext::abs2(ax*s2m); + else if(ax < b1) asml += numext::abs2(ax*s1m); + else amed += numext::abs2(ax); + } } if(amed!=amed) return amed; // we got a NaN @@ -156,36 +225,7 @@ template inline typename NumTraits::Scalar>::Real MatrixBase::stableNorm() const { - using std::sqrt; - using std::abs; - const 
Index blockSize = 4096; - RealScalar scale(0); - RealScalar invScale(1); - RealScalar ssq(0); // sum of square - - typedef typename internal::nested_eval::type DerivedCopy; - typedef typename internal::remove_all::type DerivedCopyClean; - const DerivedCopy copy(derived()); - - enum { - CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) - || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization - }; - typedef typename internal::conditional, internal::evaluator::Alignment>, - typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; - Index n = size(); - - if(n==1) - return abs(this->coeff(0)); - - Index bi = internal::first_default_aligned(copy); - if (bi>0) - internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); - for (; bi inline typename NumTraits::Scalar>::Real MatrixBase::hypotNorm() const { - return this->cwiseAbs().redux(internal::scalar_hypot_op()); + if(size()==1) + return numext::abs(coeff(0,0)); + else + return this->cwiseAbs().redux(internal::scalar_hypot_op()); } } // end namespace Eigen diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp index e4b97a6da..0dcf072fe 100644 --- a/test/stable_norm.cpp +++ b/test/stable_norm.cpp @@ -204,8 +204,6 @@ void test_hypot() factor = internal::random(); Scalar small = factor * ((std::numeric_limits::min)() * RealScalar(1e4)); - std::cout << big << " " << small << "\n"; - Scalar one (1), zero (0), sqrt2 (std::sqrt(2)), @@ -234,6 +232,7 @@ void test_stable_norm() CALL_SUBTEST_1( stable_norm(Matrix()) ); CALL_SUBTEST_2( stable_norm(Vector4d()) ); CALL_SUBTEST_3( stable_norm(VectorXd(internal::random(10,2000))) ); + CALL_SUBTEST_3( stable_norm(MatrixXd(internal::random(10,200), internal::random(10,200))) ); CALL_SUBTEST_4( stable_norm(VectorXf(internal::random(10,2000))) ); CALL_SUBTEST_5( stable_norm(VectorXcd(internal::random(10,2000))) ); CALL_SUBTEST_6( stable_norm(VectorXcf(internal::random(10,2000))) ); From b903fa74fdaad01dbf87233d03deb3b69624ae5b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Apr 2018 15:14:09 +0200 Subject: [PATCH 288/680] Extend list of MSVC versions --- Eigen/src/Core/util/Macros.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 5872ade26..9c68ecb7d 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -73,12 +73,17 @@ // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC: // name ver MSC_VER -// 2008 9 1500 -// 2010 10 1600 -// 2012 11 1700 -// 2013 12 1800 -// 2015 14 1900 -// "15" 15 1900 +// 2008 9 1500 +// 2010 10 1600 +// 2012 11 1700 +// 2013 12 1800 +// 2015 14 1900 +// "15" 15 1900 +// 2017-14.1 15.0 1910 +// 2017-14.11 15.3 1911 +// 2017-14.12 15.5 1912 +// 2017-14.13 15.6 1913 +// 2017-14.14 15.7 1914 /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) From 2f833b1c6438fb379e05f12e3111c30c0b2f3416 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 4 Apr 2018 15:47:46 +0200 Subject: [PATCH 289/680] bug #1509: fix computeInverseWithCheck for complexes --- Eigen/src/LU/InverseImpl.h | 2 +- test/inverse.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index 018f99b58..f49f23360 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -404,7 +404,7 @@ inline void MatrixBase::computeInverseWithCheck( const RealScalar& absDeterminantThreshold ) const { - RealScalar determinant; + Scalar determinant; // i'd love to put some static assertions there, but SFINAE means that they have no effect... eigen_assert(rows() == cols()); computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold); diff --git a/test/inverse.cpp b/test/inverse.cpp index 5c6777a18..97fe6ff92 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -47,7 +47,7 @@ template void inverse(const MatrixType& m) //computeInverseAndDetWithCheck tests //First: an invertible matrix bool invertible; - RealScalar det; + Scalar det; m2.setZero(); m1.computeInverseAndDetWithCheck(m2, det, invertible); @@ -113,5 +113,7 @@ void test_inverse() CALL_SUBTEST_7( inverse(Matrix4d()) ); CALL_SUBTEST_7( inverse(Matrix()) ); + + CALL_SUBTEST_8( inverse(Matrix4cd()) ); } } From c2624c03189562208a8e839b8a60291b763c8f44 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Apr 2018 08:45:19 +0200 Subject: [PATCH 290/680] Fix cmake scripts with no fortran compiler --- cmake/language_support.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/language_support.cmake b/cmake/language_support.cmake index 2f14f30b8..ddba50945 100644 --- a/cmake/language_support.cmake +++ b/cmake/language_support.cmake @@ -26,7 +26,7 @@ function(workaround_9220 language language_works) cmake_minimum_required(VERSION 2.8.0) set (CMAKE_Fortran_FLAGS \"${CMAKE_Fortran_FLAGS}\") set (CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS}\") - enable_language(${language} OPTIONAL) + enable_language(${language}) ") file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/language_tests/${language}) file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}) From 04b1628e5546732785419560e94b77c1299f4f28 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Apr 2018 13:28:31 +0200 Subject: [PATCH 291/680] Add missing empty line. --- Eigen/src/plugins/BlockMethods.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index 5fcd13cc4..0a53db19f 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -1397,4 +1397,4 @@ innerVectors(Index outerStart, Index outerSize) const IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); -} \ No newline at end of file +} From add15924ac7a358001a02c94295d17a1623444cf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Apr 2018 13:29:26 +0200 Subject: [PATCH 292/680] Fix MKL backend for symmetric eigenvalues on row-major matrices. 
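A minimal reproducer sketch (editorial; assumes a build with EIGEN_USE_LAPACKE and is
not part of the patch):

    #include <Eigen/Dense>
    typedef Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic,Eigen::RowMajor> RowMat;

    int main()
    {
      RowMat A = RowMat::Random(20,20);
      A = (A + A.transpose()).eval(); // self-adjoint input
      // With row-major storage this now reaches the corrected LAPACKE
      // specialization (always LAPACK_COL_MAJOR) instead of the removed
      // LAPACK_ROW_MAJOR instantiation:
      Eigen::SelfAdjointEigenSolver<RowMat> eig(A);
      return eig.info() == Eigen::Success ? 0 : 1;
    }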
--- .../SelfAdjointEigenSolver_LAPACKE.h | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h index 3891cf883..b0c947dc0 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h @@ -37,7 +37,7 @@ namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ -#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \ +#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW ) \ template<> template inline \ SelfAdjointEigenSolver >& \ SelfAdjointEigenSolver >::compute(const EigenBase& matrix, int options) \ @@ -47,7 +47,7 @@ SelfAdjointEigenSolver >::compute(c && (options&EigVecMask)!=EigVecMask \ && "invalid option parameter"); \ bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \ - lapack_int n = internal::convert_index(matrix.cols()), lda, matrix_order, info; \ + lapack_int n = internal::convert_index(matrix.cols()), lda, info; \ m_eivalues.resize(n,1); \ m_subdiag.resize(n-1); \ m_eivec = matrix; \ @@ -63,27 +63,24 @@ SelfAdjointEigenSolver >::compute(c } \ \ lda = internal::convert_index(m_eivec.outerStride()); \ - matrix_order=LAPACKE_COLROW; \ char jobz, uplo='L'/*, range='A'*/; \ jobz = computeEigenvectors ? 'V' : 'N'; \ \ - info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ + info = LAPACKE_##LAPACKE_NAME( LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ m_info = (info==0) ? 
Success : NoConvergence; \ m_isInitialized = true; \ m_eigenvectorsOk = computeEigenvectors; \ return *this; \ } +#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor ) -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, ColMajor, LAPACK_COL_MAJOR) - -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, RowMajor, LAPACK_ROW_MAJOR) +EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev) +EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev) +EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev) +EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev) } // end namespace Eigen From 08008f67e1c8b3eb5fa6e846f2760aae38cf82c7 Mon Sep 17 00:00:00 2001 From: vhuber Date: Mon, 9 Apr 2018 17:07:46 +0200 Subject: [PATCH 293/680] Add unitTest --- test/umfpack_support.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/umfpack_support.cpp b/test/umfpack_support.cpp index 37ab11f0b..3e9b33ba4 100644 --- a/test/umfpack_support.cpp +++ b/test/umfpack_support.cpp @@ -12,10 +12,10 @@ #include -template void test_umfpack_support_T() +template void test_umfpack_support_T() { - UmfPackLU > umfpack_colmajor; - UmfPackLU > umfpack_rowmajor; + UmfPackLU > umfpack_colmajor; + UmfPackLU > umfpack_rowmajor; check_sparse_square_solving(umfpack_colmajor); check_sparse_square_solving(umfpack_rowmajor); @@ -26,7 +26,9 @@ template void test_umfpack_support_T() void test_umfpack_support() { - CALL_SUBTEST_1(test_umfpack_support_T()); - CALL_SUBTEST_2(test_umfpack_support_T >()); + CALL_SUBTEST_1((test_umfpack_support_T())); + CALL_SUBTEST_2((test_umfpack_support_T, int >())); + CALL_SUBTEST_3((test_umfpack_support_T())); + CALL_SUBTEST_4((test_umfpack_support_T, UF_long >())); } From 8c1652055aa65a0e7bbad1834fd603c8574ebaa2 Mon Sep 17 00:00:00 2001 From: Guillaume Jacob Date: Mon, 9 Apr 2018 17:23:59 +0200 Subject: [PATCH 294/680] Fix code sample output in block(int, int, int, int) doxygen --- Eigen/src/plugins/BlockMethods.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index 0a53db19f..a5748525b 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -1044,7 +1044,7 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow /// \a NRows is \a Dynamic, and the same for the number of columns. 
/// /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp +/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out /// /// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic /// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence: From c91906b065ddfd80997204e3072bb66bc9297bcd Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 11 Apr 2018 09:59:59 +0200 Subject: [PATCH 295/680] Umfpack: UF_long has been removed in recent versions of suitesparse, and fix a few long-to-int conversions issues. --- Eigen/src/UmfPackSupport/UmfPackSupport.h | 99 +++++++++++++---------- test/umfpack_support.cpp | 6 +- 2 files changed, 59 insertions(+), 46 deletions(-) diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index 4de7ccf4c..ba10a9318 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -10,6 +10,16 @@ #ifndef EIGEN_UMFPACKSUPPORT_H #define EIGEN_UMFPACKSUPPORT_H +// for compatibility with super old version of umfpack, +// not sure this is really needed, but this is harmless. +#ifndef SuiteSparse_long +#ifdef UF_long +#define SuiteSparse_long UF_long +#else +#error neither SuiteSparse_long nor UF_long are defined +#endif +#endif + namespace Eigen { /* TODO extract L, extract U, compute det, etc... */ @@ -24,10 +34,10 @@ inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int) inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, UF_long) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long) { umfpack_dl_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, UF_long) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) { umfpack_zl_defaults(control); } // Report info @@ -37,10 +47,10 @@ inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMF inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, int) { umfpack_zi_report_info(control, info);} -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, UF_long) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long) { umfpack_dl_report_info(control, info);} -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, UF_long) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, SuiteSparse_long) { umfpack_zl_report_info(control, info);} // Report status @@ -50,10 +60,10 @@ inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, d inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, int) { umfpack_zi_report_status(control, status);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, UF_long) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long) { umfpack_dl_report_status(control, status);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, 
std::complex, UF_long) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, SuiteSparse_long) { umfpack_zl_report_status(control, status);} // report control @@ -63,10 +73,10 @@ inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int) inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_report_control(control);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, UF_long) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long) { umfpack_dl_report_control(control);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, UF_long) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) { umfpack_zl_report_control(control);} // Free numeric @@ -76,10 +86,10 @@ inline void umfpack_free_numeric(void **Numeric, double, int) inline void umfpack_free_numeric(void **Numeric, std::complex, int) { umfpack_zi_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_numeric(void **Numeric, double, UF_long) +inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long) { umfpack_dl_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_numeric(void **Numeric, std::complex, UF_long) +inline void umfpack_free_numeric(void **Numeric, std::complex, SuiteSparse_long) { umfpack_zl_free_numeric(Numeric); *Numeric = 0; } // Free symbolic @@ -89,10 +99,10 @@ inline void umfpack_free_symbolic(void **Symbolic, double, int) inline void umfpack_free_symbolic(void **Symbolic, std::complex, int) { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; } -inline void umfpack_free_symbolic(void **Symbolic, double, UF_long) +inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long) { umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; } -inline void umfpack_free_symbolic(void **Symbolic, std::complex, UF_long) +inline void umfpack_free_symbolic(void **Symbolic, std::complex, SuiteSparse_long) { umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; } // Symbolic @@ -109,16 +119,16 @@ inline int umfpack_symbolic(int n_row,int n_col, { return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); } -inline int umfpack_symbolic(UF_long n_row,UF_long n_col, - const UF_long Ap[], const UF_long Ai[], const double Ax[], void **Symbolic, - const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, + const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) { return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info); } -inline int umfpack_symbolic(UF_long n_row,UF_long n_col, - const UF_long Ap[], const UF_long Ai[], const std::complex Ax[], void **Symbolic, - const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, + const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) { return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); } @@ -137,16 +147,16 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex Ax[], - void 
*Symbolic, void **Numeric, - const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) +inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], + void *Symbolic, void **Numeric, + const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) { return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info); } @@ -166,16 +176,16 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); } -inline int umfpack_solve( int sys, const UF_long Ap[], const UF_long Ai[], const double Ax[], - double X[], const double B[], void *Numeric, - const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], + double X[], const double B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) { return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info); } -inline int umfpack_solve( int sys, const UF_long Ap[], const UF_long Ai[], const std::complex Ax[], - std::complex X[], const std::complex B[], void *Numeric, - const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], + std::complex X[], const std::complex B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) { return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); } @@ -191,12 +201,14 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_ return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); } -inline int umfpack_get_lunz(UF_long *lnz, UF_long *unz, UF_long *n_row, UF_long *n_col, UF_long *nz_udiag, void *Numeric, double) +inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, + SuiteSparse_long *nz_udiag, void *Numeric, double) { return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); } -inline int umfpack_get_lunz(UF_long *lnz, UF_long *unz, UF_long *n_row, UF_long *n_col, UF_long *nz_udiag, void *Numeric, std::complex) +inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, + SuiteSparse_long *nz_udiag, void *Numeric, std::complex) { return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); } @@ -217,14 +229,14 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex Lx[], in return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, Dx?&dx0_real:0,0,do_recip,Rs,Numeric); } -inline int umfpack_get_numeric(UF_long Lp[], UF_long Lj[], double Lx[], UF_long Up[], UF_long Ui[], double Ux[], - UF_long P[], UF_long Q[], double Dx[], UF_long *do_recip, double Rs[], void *Numeric) +inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[], + SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) { return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric); } 
-inline int umfpack_get_numeric(UF_long Lp[], UF_long Lj[], std::complex Lx[], UF_long Up[], UF_long Ui[], std::complex Ux[], - UF_long P[], UF_long Q[], std::complex Dx[], UF_long *do_recip, double Rs[], void *Numeric) +inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex Ux[], + SuiteSparse_long P[], SuiteSparse_long Q[], std::complex Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) { double& lx0_real = numext::real_ref(Lx[0]); double& ux0_real = numext::real_ref(Ux[0]); @@ -245,12 +257,12 @@ inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *N return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); } -inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], UF_long) +inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) { return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info); } -inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], UF_long) +inline SuiteSparse_long umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) { double& mx_real = numext::real_ref(*Mx); return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); @@ -532,7 +544,7 @@ class UmfPackLU : public SparseSolverBase > // cached data to reduce reallocation, etc. mutable LUMatrixType m_l; - int m_fact_errorCode; + StorageIndex m_fact_errorCode; UmfpackControl m_control; mutable UmfpackInfo m_umfpackInfo; @@ -601,7 +613,7 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet"); eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); - int errorCode; + StorageIndex errorCode; Scalar* x_ptr = 0; Matrix x_tmp; if(x.innerStride()!=1) @@ -613,9 +625,10 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas { if(x.innerStride()==1) x_ptr = &x.col(j).coeffRef(0); - errorCode = umfpack_solve(UMFPACK_A, - mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data()); + errorCode = umfpack_solve(UMFPACK_A, + mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), + x_ptr, &b.const_cast_derived().col(j).coeffRef(0), + m_numeric, m_control.data(), m_umfpackInfo.data()); if(x.innerStride()!=1) x.col(j) = x_tmp; if (errorCode!=0) diff --git a/test/umfpack_support.cpp b/test/umfpack_support.cpp index 3e9b33ba4..0378801f4 100644 --- a/test/umfpack_support.cpp +++ b/test/umfpack_support.cpp @@ -27,8 +27,8 @@ template void test_umfpack_support_T() void test_umfpack_support() { CALL_SUBTEST_1((test_umfpack_support_T())); - CALL_SUBTEST_2((test_umfpack_support_T, int >())); - CALL_SUBTEST_3((test_umfpack_support_T())); - CALL_SUBTEST_4((test_umfpack_support_T, UF_long >())); + CALL_SUBTEST_2((test_umfpack_support_T, int>())); + CALL_SUBTEST_3((test_umfpack_support_T())); + CALL_SUBTEST_4((test_umfpack_support_T, long>())); } From e798466871ceef80a5bd78eba460735fca829a8c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 11 Apr 2018 10:46:11 +0200 Subject: [PATCH 296/680] 
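For reference, the updated documentation example in self-contained form (editorial
sketch mirroring the diff below):

    #include <Eigen/Dense>
    #include <iostream>
    using namespace Eigen;

    int main()
    {
      MatrixXf A = MatrixXf::Random(3, 2);
      VectorXf b = VectorXf::Random(3);
      // BDCSVD scales to large problems and falls back to JacobiSVD for small
      // ones; the thin U and V factors suffice for least-squares solving.
      std::cout << "The least-squares solution is:\n"
                << A.bdcSvd(ComputeThinU | ComputeThinV).solve(b) << std::endl;
      return 0;
    }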
bug #1538: update manual pages regarding BDCSVD. --- doc/LeastSquares.dox | 2 +- doc/TopicLinearAlgebraDecompositions.dox | 14 +++++++++++- doc/TutorialLinearAlgebra.dox | 28 ++++++++++++++++++++---- doc/examples/TutorialLinAlgSVDSolve.cpp | 2 +- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/doc/LeastSquares.dox b/doc/LeastSquares.dox index e2191a22f..24dfe4b4f 100644 --- a/doc/LeastSquares.dox +++ b/doc/LeastSquares.dox @@ -16,7 +16,7 @@ equations is the fastest but least accurate, and the QR decomposition is in betw \section LeastSquaresSVD Using the SVD decomposition -The \link JacobiSVD::solve() solve() \endlink method in the JacobiSVD class can be directly used to +The \link BDCSVD::solve() solve() \endlink method in the BDCSVD class can be directly used to solve linear squares systems. It is not enough to compute only the singular values (the default for this class); you also need the singular vectors but the thin SVD decomposition suffices for computing least squares solutions: diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 991f964cc..0965da872 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -4,7 +4,7 @@ namespace Eigen { This page presents a catalogue of the dense matrix decompositions offered by Eigen. For an introduction on linear solvers and decompositions, check this \link TutorialLinearAlgebra page \endlink. -To get an overview of the true relative speed of the different decomposition, check this \link DenseDecompositionBenchmark benchmark \endlink. +To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink. \section TopicLinAlgBigTable Catalogue of decompositions offered by Eigen @@ -113,6 +113,18 @@ To get an overview of the true relative speed of the different decomposition, ch
 \n Singular values and eigenvalues decompositions
+BDCSVD (divide \& conquer) | - | One of the fastest SVD algorithms | Excellent | Yes | Singular values/vectors, least squares | Yes (and does least squares) | Excellent | Blocked bidiagonalization
 JacobiSVD (two-sided) | -
(the rows below are from the solver table of doc/TutorialLinearAlgebra.dox, modified by the same patch)
 ColPivHouseholderQR | colPivHouseholderQr() | None | +++ - +++
 - - +++
 CompleteOrthogonalDecomposition | completeOrthogonalDecomposition() | None | + | - | +++
 LLT | llt() | + ++
+BDCSVD | bdcSvd() | None | - | - | +++
 JacobiSVD | jacobiSvd() | None | - - | - - - | +++
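A minimal usage sketch of the new BDCSVD entry (an editorial illustration, not part of the patch; it mirrors the tutorial example updated below):

    #include <Eigen/Dense>
    #include <iostream>
    int main()
    {
      Eigen::MatrixXf A = Eigen::MatrixXf::Random(100, 4);  // overdetermined system
      Eigen::VectorXf b = Eigen::VectorXf::Random(100);
      // Thin U and V suffice for least-squares solving; singular values alone do not.
      Eigen::BDCSVD<Eigen::MatrixXf> svd(A, Eigen::ComputeThinU | Eigen::ComputeThinV);
      Eigen::VectorXf x = svd.solve(b);                      // least-squares solution of A*x = b
      std::cout << x << std::endl;
    }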
+To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink. All of these decompositions offer a solve() method that works as in the above example. @@ -183,8 +200,11 @@ Here is an example: \section TutorialLinAlgLeastsquares Least squares solving -The most accurate method to do least squares solving is with a SVD decomposition. Eigen provides one -as the JacobiSVD class, and its solve() is doing least-squares solving. +The most accurate method to do least squares solving is with an SVD decomposition. +Eigen provides two implementations. +The recommended one is the BDCSVD class, which scales well for large problems +and automatically falls back to the JacobiSVD class for smaller problems. +For both classes, the solve() method does least-squares solving. Here is an example: diff --git a/doc/examples/TutorialLinAlgSVDSolve.cpp b/doc/examples/TutorialLinAlgSVDSolve.cpp index 9fbc031de..f109f04e5 100644 --- a/doc/examples/TutorialLinAlgSVDSolve.cpp +++ b/doc/examples/TutorialLinAlgSVDSolve.cpp @@ -11,5 +11,5 @@ int main() VectorXf b = VectorXf::Random(3); cout << "Here is the right hand side b:\n" << b << endl; cout << "The least-squares solution is:\n" - << A.jacobiSvd(ComputeThinU | ComputeThinV).solve(b) << endl; + << A.bdcSvd(ComputeThinU | ComputeThinV).solve(b) << endl; } From 9c8decffbf13604d419d5ea530169cc91afcd45a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 11 Apr 2018 11:30:14 +0200 Subject: [PATCH 297/680] Fix javascript hacks for doxygen 1.8.13 --- doc/eigen_navtree_hacks.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/eigen_navtree_hacks.js b/doc/eigen_navtree_hacks.js index bd7e02b38..39c59f73c 100644 --- a/doc/eigen_navtree_hacks.js +++ b/doc/eigen_navtree_hacks.js @@ -65,6 +65,10 @@ function getNode(o, po) function resizeHeight() { var toc = $("#nav-toc"); + var header = $("#header"); + var content = $("#doc-content"); + var navtree = $("#nav-path"); + var sidenav = $("#side-nav"); var tocHeight = toc.height(); // <- we added this line var headerHeight = header.height(); var footerHeight = footer.height(); From 426052ef6e10714f6eb1179eee405bfa77ce0cc0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 11 Apr 2018 11:30:34 +0200 Subject: [PATCH 298/680] Update header/footer for doxygen 1.8.13 --- doc/eigendoxy_footer.html.in | 4 ++-- doc/eigendoxy_header.html.in | 18 ++++++++---------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/doc/eigendoxy_footer.html.in b/doc/eigendoxy_footer.html.in index 878244a19..9ac0596cb 100644 --- a/doc/eigendoxy_footer.html.in +++ b/doc/eigendoxy_footer.html.in @@ -5,14 +5,14 @@ $navpath + doxygen $doxygenversion diff --git a/doc/eigendoxy_header.html.in b/doc/eigendoxy_header.html.in index 0f3859f40..bb149f8f0 100644 --- a/doc/eigendoxy_header.html.in +++ b/doc/eigendoxy_header.html.in @@ -4,25 +4,23 @@ + $projectname: $title $title - - - + + + $treeview $search $mathjax - + - -
-
@@ -30,10 +28,10 @@ $mathjax
- + - From 79266fec75a78a31126efbbfd603d8459660350d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 11 Apr 2018 11:31:17 +0200 Subject: [PATCH 299/680] extend doxygen splitter for huge screens --- doc/eigendoxy.css | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css index b99d7914a..427b128ba 100644 --- a/doc/eigendoxy.css +++ b/doc/eigendoxy.css @@ -219,3 +219,8 @@ h3.version { td.width20em p.endtd { width: 20em; } + +/* needed for huge screens */ +.ui-resizable-e { + background-repeat: repeat-y; +} \ No newline at end of file From b0eda3cb9fc08d7b0bd39d99a1874b9575439e2c Mon Sep 17 00:00:00 2001 From: Weiming Zhao Date: Wed, 11 Apr 2018 11:37:06 +0200 Subject: [PATCH 300/680] Avoid using memcpy for non-POD elements --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 2264be391..fe62ff1ea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -55,7 +55,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { - if (dest) { + if (!NumTraits::type>::RequireInitialization && dest) { m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); return false; } From e43ca0320d58587736d8bab7c6ed7212aa6b8d9d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 11 Apr 2018 15:24:13 +0200 Subject: [PATCH 301/680] bug #1520: workaround some -Wfloat-equal warnings by calling std::equal_to --- Eigen/src/Core/MathFunctions.h | 6 +++--- Eigen/src/Core/arch/CUDA/Half.h | 4 ++-- Eigen/src/Core/util/Meta.h | 20 ++++++++++++++++++++ 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 84f9d0cd5..ac1b7a6a1 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -473,11 +473,11 @@ namespace std_fallback { EIGEN_USING_STD_MATH(exp); Scalar u = exp(x); - if (u == Scalar(1)) { + if (numext::equal_strict(u, Scalar(1))) { return x; } Scalar um1 = u - RealScalar(1); - if (um1 == Scalar(-1)) { + if (numext::equal_strict(um1, Scalar(-1))) { return RealScalar(-1); } @@ -519,7 +519,7 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; - return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + return numext::equal_strict(x1p, Scalar(1)) ? 
x : x * ( log(x1p) / (x1p - RealScalar(1)) ); } } diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 2bb3d0d9b..c10550050 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -299,10 +299,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) return a; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { - return float(a) == float(b); + return numext::equal_strict(float(a),float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { - return float(a) != float(b); + return numext::not_equal_strict(float(a), float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { return float(a) < float(b); diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 998b8921a..e7729fdc8 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -543,6 +543,26 @@ T div_ceil(const T &a, const T &b) return (a+b-1) / b; } +// The aim of the following functions is to bypass -Wfloat-equal warnings +// when we really want a strict equality comparison on floating points. +template +bool equal_strict(const X& x,const Y& y) { return x == y; } + +template<> +bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } + +template<> +bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } + +template +bool not_equal_strict(const X& x,const Y& y) { return x != y; } + +template<> +bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } + +template<> +bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } + } // end namespace numext } // end namespace Eigen From 7a9089c33cdb7f9d6cea0380ca75a44969c3ef78 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 13 Apr 2018 08:51:47 +0200 Subject: [PATCH 302/680] fix linking issue --- Eigen/src/Core/util/Meta.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index e7729fdc8..1180f0c13 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -545,22 +545,22 @@ T div_ceil(const T &a, const T &b) // The aim of the following functions is to bypass -Wfloat-equal warnings // when we really want a strict equality comparison on floating points. 
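// Editorial usage sketch, not part of the patch: a caller can now write, e.g.,
//   if (numext::equal_strict(u, Scalar(1))) return x;
// instead of `u == Scalar(1)`. For float and double the comparison is routed
// through std::equal_to<T>()(x,y) (header <functional>), which performs exactly
// the same test as x == y but does not trigger -Wfloat-equal at the call site.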
-template +template EIGEN_STRONG_INLINE bool equal_strict(const X& x,const Y& y) { return x == y; } -template<> +template<> EIGEN_STRONG_INLINE bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } -template<> +template<> EIGEN_STRONG_INLINE bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } -template +template EIGEN_STRONG_INLINE bool not_equal_strict(const X& x,const Y& y) { return x != y; } -template<> +template<> EIGEN_STRONG_INLINE bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } -template<> +template<> EIGEN_STRONG_INLINE bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } } // end namespace numext From 072e111ec08be523f8bf5b4c742b5cbcb8ea0983 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 13 Apr 2018 19:00:34 +0200 Subject: [PATCH 303/680] SelfAdjointView<...,Mode> causes a static assert since commit d820ab9edc0b38af4cdb3d545714a0c9083e5a78 --- Eigen/src/Core/SelfAdjointView.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index be219726b..2cf3fa1ef 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -191,7 +191,7 @@ template class SelfAdjointView TriangularView >::type(tmp2); } - typedef SelfAdjointView ConjugateReturnType; + typedef SelfAdjointView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const From 4d392d93aaf35447dff7b4522dcf587e49a44b8f Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 13 Apr 2018 19:01:37 +0200 Subject: [PATCH 304/680] Make hypot_impl compile again for types with expression-templates (e.g., boost::multiprecision) --- Eigen/src/Core/MathFunctionsImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index cc2ac0700..28737c15e 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -85,7 +85,7 @@ struct hypot_impl static inline RealScalar run(const Scalar& x, const Scalar& y) { EIGEN_USING_STD_MATH(abs); - return positive_real_hypot(abs(x), abs(y)); + return positive_real_hypot(abs(x), abs(y)); } }; From 84dcd998a92e4f90f963d2849252f4338adaf398 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 13 Apr 2018 19:10:23 +0200 Subject: [PATCH 305/680] Recent Adolc versions require C++11 --- unsupported/test/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 22647cadd..42b790d55 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -41,11 +41,16 @@ else(GOOGLEHASH_FOUND) ei_add_property(EIGEN_MISSING_BACKENDS "GoogleHash, ") endif(GOOGLEHASH_FOUND) + find_package(Adolc) if(ADOLC_FOUND) include_directories(${ADOLC_INCLUDES}) ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ") - ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES}) + if(EIGEN_TEST_CXX11) + ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES}) + else() + message(STATUS "Adolc found, but tests require C++11 mode") + endif() else(ADOLC_FOUND) ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ") endif(ADOLC_FOUND) From 2cbb00b18ee3d02d3b747bcb1775d7f851e46d7e Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 13 Apr 2018 19:14:25 +0200 Subject: [PATCH 306/680] No need to 
make noise if KLU is found --- test/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8bcf3f7c5..f8deafc07 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -309,7 +309,6 @@ if(UMFPACK_FOUND) endif() if(KLU_FOUND OR SuiteSparse_FOUND) - message("ADDING KLU TEST") ei_add_test(klu_support "" "${KLU_ALL_LIBS}") endif() From c8b19702bc25b64b5ac9ab8f590a4706c3f46208 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 13 Apr 2018 20:36:58 +0200 Subject: [PATCH 307/680] Limit test size for sparse Cholesky solvers to EIGEN_TEST_MAX_SIZE --- test/simplicial_cholesky.cpp | 4 ++-- test/sparse_solver.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/simplicial_cholesky.cpp b/test/simplicial_cholesky.cpp index 649c817b4..5bb1e0d6c 100644 --- a/test/simplicial_cholesky.cpp +++ b/test/simplicial_cholesky.cpp @@ -35,8 +35,8 @@ template void test_simplicial_cholesky_T() check_sparse_spd_determinant(ldlt_colmajor_lower_amd); check_sparse_spd_determinant(ldlt_colmajor_upper_amd); - check_sparse_spd_solving(ldlt_colmajor_lower_nat, 300, 1000); - check_sparse_spd_solving(ldlt_colmajor_upper_nat, 300, 1000); + check_sparse_spd_solving(ldlt_colmajor_lower_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000); + check_sparse_spd_solving(ldlt_colmajor_upper_nat, (std::min)(300,EIGEN_TEST_MAX_SIZE), 1000); } void test_simplicial_cholesky() diff --git a/test/sparse_solver.h b/test/sparse_solver.h index 5145bc3eb..19416ed5d 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -266,7 +266,7 @@ std::string solver_stats(const SparseSolverBase &/*solver*/) } #endif -template void check_sparse_spd_solving(Solver& solver, int maxSize = 300, int maxRealWorldSize = 100000) +template void check_sparse_spd_solving(Solver& solver, int maxSize = (std::min)(300,EIGEN_TEST_MAX_SIZE), int maxRealWorldSize = 100000) { typedef typename Solver::MatrixType Mat; typedef typename Mat::Scalar Scalar; From c9ecfff2e6ab3e68bc4569e05f3e29503e756da2 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 13 Apr 2018 21:05:28 +0000 Subject: [PATCH 308/680] Add links where to make PRs and report bugs into README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 4654a81c3..99c9e2933 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ **Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.** For more information go to http://eigen.tuxfamily.org/. + +For ***pull requests***, please only use the official repository at https://bitbucket.org/eigen/eigen. + +For ***bug reports*** and ***feature requests*** go to http://eigen.tuxfamily.org/bz. From 42715533f1bff346d024edce4f79c5fdef7dba04 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sun, 15 Apr 2018 10:15:28 +0200 Subject: [PATCH 309/680] bug #1493: Make representation of HouseholderSequence consistent and working for complex numbers. Made corresponding unit test actually test that.
Also simplify implementation of QR decompositions --- .../src/Eigenvalues/HessenbergDecomposition.h | 2 +- Eigen/src/Householder/Householder.h | 8 ++--- Eigen/src/Householder/HouseholderSequence.h | 36 +++++++++++++++---- Eigen/src/QR/ColPivHouseholderQR.h | 6 +--- .../src/QR/CompleteOrthogonalDecomposition.h | 7 ++-- Eigen/src/QR/HouseholderQR.h | 6 +--- Eigen/src/SVD/UpperBidiagonalization.h | 2 +- test/householder.cpp | 29 ++++++++++----- test/qr.cpp | 2 +- 9 files changed, 60 insertions(+), 38 deletions(-) diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h index f647f69b0..d947dac4e 100644 --- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -315,7 +315,7 @@ void HessenbergDecomposition::_compute(MatrixType& matA, CoeffVector // A = A H' matA.rightCols(remainingSize) - .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0)); + .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0)); } } diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index 80de2c305..a5f336d18 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -103,7 +103,7 @@ void MatrixBase::makeHouseholder( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->cols() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheRight() @@ -140,7 +140,7 @@ void MatrixBase::applyHouseholderOnTheLeft( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->rows() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheLeft() @@ -160,10 +160,10 @@ void MatrixBase::applyHouseholderOnTheRight( { Map::type> tmp(workspace,rows()); Block right(derived(), 0, 1, rows(), cols()-1); - tmp.noalias() = right * essential.conjugate(); + tmp.noalias() = right * essential; tmp += this->col(0); this->col(0) -= tau * tmp; - right.noalias() -= tau * tmp * essential.transpose(); + right.noalias() -= tau * tmp * essential.adjoint(); } } diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index 3ce0a693d..13030c664 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -140,6 +140,22 @@ template class HouseholderS Side > ConjugateReturnType; + typedef HouseholderSequence< + VectorsType, + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + CoeffsType>::type, + Side + > AdjointReturnType; + + typedef HouseholderSequence< + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + VectorsType>::type, + CoeffsType, + Side + > TransposeReturnType; + /** \brief Constructor. * \param[in] v %Matrix containing the essential parts of the Householder vectors * \param[in] h Vector containing the Householder coefficients @@ -206,9 +222,12 @@ template class HouseholderS } /** \brief %Transpose of the Householder sequence. 
*/ - HouseholderSequence transpose() const + TransposeReturnType transpose() const { - return HouseholderSequence(*this).setTrans(!m_trans); + return TransposeReturnType(m_vectors.conjugate(), m_coeffs) + .setTrans(!m_trans) + .setLength(m_length) + .setShift(m_shift); } /** \brief Complex conjugate of the Householder sequence. */ @@ -221,13 +240,16 @@ template class HouseholderS } /** \brief Adjoint (conjugate transpose) of the Householder sequence. */ - ConjugateReturnType adjoint() const + AdjointReturnType adjoint() const { - return conjugate().setTrans(!m_trans); + return AdjointReturnType(m_vectors, m_coeffs.conjugate()) + .setTrans(!m_trans) + .setLength(m_length) + .setShift(m_shift); } /** \brief Inverse of the Householder sequence (equals the adjoint). */ - ConjugateReturnType inverse() const { return adjoint(); } + AdjointReturnType inverse() const { return adjoint(); } /** \internal */ template inline void evalTo(DestType& dst) const @@ -273,10 +295,10 @@ template class HouseholderS Index cornerSize = rows() - k - m_shift; if(m_trans) dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data()); } } } diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index ed47b05e3..1faa3442e 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -595,11 +595,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs) - .setLength(nonzero_pivots) - .transpose() - ); + c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() ); m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) .template triangularView() diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 880becb25..03017a375 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -452,7 +452,7 @@ void CompleteOrthogonalDecomposition::computeInPlace() // Apply Z(k) to the first k rows of X_k m_cpqr.m_qr.topRightCorner(k, cols - rank + 1) .applyHouseholderOnTheRight( - m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k), + m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k), &m_temp(0)); } if (k != rank - 1) { @@ -500,11 +500,8 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( } // Compute c = Q^* * rhs - // Note that the matrix Q = H_0^* H_1^*... 
so its inverse is - // Q^* = (H_0 H_1 ...)^T typename RhsType::PlainObject c(rhs); - c.applyOnTheLeft( - householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose()); + c.applyOnTheLeft(matrixQ().setLength(rank).adjoint()); // Solve T z = c(1:rank, :) dst.topRows(rank) = matrixT() diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 762b21c36..13c02685d 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -353,11 +353,7 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence( - m_qr.leftCols(rank), - m_hCoeffs.head(rank)).transpose() - ); + c.applyOnTheLeft(householderQ().setLength(rank).adjoint() ); m_qr.topLeftCorner(rank, rank) .template triangularView() diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 0526ac931..997defc47 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat, .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]); // apply householder transform to remaining part of mat on the left mat.bottomRightCorner(remainingRows-1, remainingCols) - .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData); + .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData); } } diff --git a/test/householder.cpp b/test/householder.cpp index c5f6b5e4f..78d44d48a 100644 --- a/test/householder.cpp +++ b/test/householder.cpp @@ -49,6 +49,17 @@ template void householder(const MatrixType& m) v1.applyHouseholderOnTheLeft(essential,beta,tmp); VERIFY_IS_APPROX(v1.norm(), v2.norm()); + // reconstruct householder matrix: + SquareMatrixType id, H1, H2; + id.setIdentity(rows, rows); + H1 = H2 = id; + VectorType vv(rows); + vv << Scalar(1), essential; + H1.applyHouseholderOnTheLeft(essential, beta, tmp); + H2.applyHouseholderOnTheRight(essential, beta, tmp); + VERIFY_IS_APPROX(H1, H2); + VERIFY_IS_APPROX(H1, id - beta * vv*vv.adjoint()); + MatrixType m1(rows, cols), m2(rows, cols); @@ -69,7 +80,7 @@ template void householder(const MatrixType& m) m3.rowwise() = v1.transpose(); m4 = m3; m3.row(0).makeHouseholder(essential, beta, alpha); - m3.applyHouseholderOnTheRight(essential,beta,tmp); + m3.applyHouseholderOnTheRight(essential.conjugate(),beta,tmp); VERIFY_IS_APPROX(m3.norm(), m4.norm()); if(rows>=2) VERIFY_IS_MUCH_SMALLER_THAN(m3.block(0,1,rows,rows-1).norm(), m3.norm()); VERIFY_IS_MUCH_SMALLER_THAN(numext::imag(m3(0,0)), numext::real(m3(0,0))); @@ -104,14 +115,14 @@ template void householder(const MatrixType& m) VERIFY_IS_APPROX(hseq_mat.adjoint(), hseq_mat_adj); VERIFY_IS_APPROX(hseq_mat.conjugate(), hseq_mat_conj); VERIFY_IS_APPROX(hseq_mat.transpose(), hseq_mat_trans); - VERIFY_IS_APPROX(hseq_mat * m6, hseq_mat * m6); - VERIFY_IS_APPROX(hseq_mat.adjoint() * m6, hseq_mat_adj * m6); - VERIFY_IS_APPROX(hseq_mat.conjugate() * m6, hseq_mat_conj * m6); - VERIFY_IS_APPROX(hseq_mat.transpose() * m6, hseq_mat_trans * m6); - VERIFY_IS_APPROX(m6 * hseq_mat, m6 * hseq_mat); - VERIFY_IS_APPROX(m6 * hseq_mat.adjoint(), m6 * hseq_mat_adj); - VERIFY_IS_APPROX(m6 * hseq_mat.conjugate(), m6 * hseq_mat_conj); - VERIFY_IS_APPROX(m6 * hseq_mat.transpose(), m6 * hseq_mat_trans); + VERIFY_IS_APPROX(hseq * 
m6, hseq_mat * m6); + VERIFY_IS_APPROX(hseq.adjoint() * m6, hseq_mat_adj * m6); + VERIFY_IS_APPROX(hseq.conjugate() * m6, hseq_mat_conj * m6); + VERIFY_IS_APPROX(hseq.transpose() * m6, hseq_mat_trans * m6); + VERIFY_IS_APPROX(m6 * hseq, m6 * hseq_mat); + VERIFY_IS_APPROX(m6 * hseq.adjoint(), m6 * hseq_mat_adj); + VERIFY_IS_APPROX(m6 * hseq.conjugate(), m6 * hseq_mat_conj); + VERIFY_IS_APPROX(m6 * hseq.transpose(), m6 * hseq_mat_trans); // test householder sequence on the right with a shift diff --git a/test/qr.cpp b/test/qr.cpp index dfcc1e8f9..00ef99ef9 100644 --- a/test/qr.cpp +++ b/test/qr.cpp @@ -85,7 +85,7 @@ template void qr_invertible() qr.compute(m1); VERIFY_IS_APPROX(log(absdet), qr.logAbsDeterminant()); // This test is tricky if the determinant becomes too small. - // Since we generate random numbers with magnitude rrange [0,1], the average determinant is 0.5^size + // Since we generate random numbers with magnitude range [0,1], the average determinant is 0.5^size VERIFY_IS_MUCH_SMALLER_THAN( abs(absdet-qr.absDeterminant()), numext::maxi(RealScalar(pow(0.5,size)),numext::maxi(abs(absdet),abs(qr.absDeterminant()))) ); } From 775766d175f52d4aa86268c6e6ff43cee374a56a Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sun, 15 Apr 2018 18:43:56 +0200 Subject: [PATCH 310/680] Add parentheses to fix compiler warnings --- test/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/main.h b/test/main.h index 14f4e3e7a..0fcd6cb76 100644 --- a/test/main.h +++ b/test/main.h @@ -265,7 +265,7 @@ namespace Eigen { \ Eigen::no_more_assert = true; \ if(report_on_cerr_on_assert_failure) \ - eigen_plain_assert(a && #MSG); \ + eigen_plain_assert((a) && #MSG); \ else \ EIGEN_THROW_X(Eigen::eigen_static_assert_exception()); \ } From 39c2cba810a573ae4d0efd2b0b80e08c934b99b3 Mon Sep 17 00:00:00 2001 From: nicolov Date: Fri, 13 Apr 2018 22:29:10 +0000 Subject: [PATCH 311/680] Add a specialization of Eigen::numext::conj for std::complex to be used when compiling a cuda kernel. This fixes the compilation of TensorFlow 1.4 with clang 6.0 used as CUDA compiler with libc++. This follows the previous change in https://bitbucket.org/eigen/eigen/commits/2a69290ddb165b7103c87ba8f5b257eca23f62aa , which mentions OSX (I guess because it uses libc++ too). --- Eigen/src/Core/MathFunctions.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index ac1b7a6a1..05462c5e1 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -238,7 +238,7 @@ struct imag_ref_retval ****************************************************************************/ template::IsComplex> -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -248,7 +248,7 @@ struct conj_impl }; template -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -258,6 +258,20 @@ struct conj_impl } }; +template struct conj_impl : conj_default_impl {}; + +#ifdef EIGEN_CUDA_ARCH +template +struct conj_impl > +{ + EIGEN_DEVICE_FUNC + static inline std::complex run(const std::complex& x) + { + return std::complex(x.real(), -x.imag()); + } +}; +#endif + template struct conj_retval { From 2cbb00b18ee3d02d3b747bcb1775d7f851e46d7e Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 17 Apr 2018 11:30:27 +0200 Subject: [PATCH 312/680] Renamed .trans() et al. to .reverseFlag() et al.
Adapted documentation of .setReverseFlag() --- Eigen/src/Householder/HouseholderSequence.h | 44 +++++++++++---------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index 13030c664..fad1d5ab6 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -174,7 +174,7 @@ template class HouseholderS * \sa setLength(), setShift() */ HouseholderSequence(const VectorsType& v, const CoeffsType& h) - : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()), + : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) { } @@ -183,7 +183,7 @@ template class HouseholderS HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), - m_trans(other.m_trans), + m_reverse(other.m_reverse), m_length(other.m_length), m_shift(other.m_shift) { @@ -225,7 +225,7 @@ template class HouseholderS TransposeReturnType transpose() const { return TransposeReturnType(m_vectors.conjugate(), m_coeffs) - .setTrans(!m_trans) + .setReverseFlag(!m_reverse) .setLength(m_length) .setShift(m_shift); } @@ -234,7 +234,7 @@ template class HouseholderS ConjugateReturnType conjugate() const { return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate()) - .setTrans(m_trans) + .setReverseFlag(m_reverse) .setLength(m_length) .setShift(m_shift); } @@ -243,7 +243,7 @@ template class HouseholderS AdjointReturnType adjoint() const { return AdjointReturnType(m_vectors, m_coeffs.conjugate()) - .setTrans(!m_trans) + .setReverseFlag(!m_reverse) .setLength(m_length) .setShift(m_shift); } @@ -273,7 +273,7 @@ template class HouseholderS for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else @@ -293,7 +293,7 @@ template class HouseholderS for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else @@ -317,7 +317,7 @@ template class HouseholderS workspace.resize(dst.rows()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? m_length-k-1 : k; + Index actual_k = m_reverse ? m_length-k-1 : k; dst.rightCols(rows()-m_shift-actual_k) .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } @@ -340,8 +340,8 @@ template class HouseholderS { for(Index i = 0; i < m_length; i+=BlockSize) { - Index end = m_trans ? (std::min)(m_length,i+BlockSize) : m_length-i; - Index k = m_trans ? i : (std::max)(Index(0),end-BlockSize); + Index end = m_reverse ? (std::min)(m_length,i+BlockSize) : m_length-i; + Index k = m_reverse ? i : (std::max)(Index(0),end-BlockSize); Index bs = end-k; Index start = k + m_shift; @@ -352,7 +352,7 @@ template class HouseholderS Side==OnTheRight ? 
m_vectors.cols()-start : bs); typename internal::conditional, SubVectorsType&>::type sub_vecs(sub_vecs1); Block sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols()); - apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans); + apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); } } else @@ -360,7 +360,7 @@ template class HouseholderS workspace.resize(dst.cols()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? k : m_length-k-1; + Index actual_k = m_reverse ? k : m_length-k-1; dst.bottomRows(rows()-m_shift-actual_k) .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } @@ -425,25 +425,27 @@ template class HouseholderS protected: - /** \brief Sets the transpose flag. - * \param [in] trans New value of the transpose flag. + /** \internal + * \brief Sets the reverse flag. + * \param [in] reverse New value of the reverse flag. * - * By default, the transpose flag is not set. If the transpose flag is set, then this object represents - * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * By default, the reverse flag is not set. If the reverse flag is set, then this object represents + * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$. * - * \sa trans() + * \sa reverseFlag(), transpose(), adjoint() */ - HouseholderSequence& setTrans(bool trans) + HouseholderSequence& setReverseFlag(bool reverse) { - m_trans = trans; + m_reverse = reverse; return *this; } - bool trans() const { return m_trans; } /**< \brief Returns the transpose flag. */ + bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. 
*/ typename VectorsType::Nested m_vectors; typename CoeffsType::Nested m_coeffs; - bool m_trans; + bool m_reverse; Index m_length; Index m_shift; }; From 02d2f1cb4ad6fdcc38d691c31a4b0de043e17654 Mon Sep 17 00:00:00 2001 From: Dmitriy Korchemkin Date: Wed, 18 Apr 2018 13:52:46 +0300 Subject: [PATCH 313/680] Cast zeros to Scalar in RealSchur --- Eigen/src/Eigenvalues/RealSchur.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 9e71f3040..b72799e5b 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -303,7 +303,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa Scalar exshift(0); // sum of exceptional shifts Scalar norm = computeNormOfT(); - if(norm!=0) + if(norm!=Scalar(0)) { while (iu >= 0) { @@ -327,7 +327,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa else // No convergence yet { // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG ) - Vector3s firstHouseholderVector(0,0,0), shiftInfo; + Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo; computeShift(iu, iter, exshift, shiftInfo); iter = iter + 1; totalIter = totalIter + 1; From 686fb57233736583c7ff4462aac403591ba762b9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 18 Apr 2018 18:46:34 +0200 Subject: [PATCH 314/680] fix const cast in NEON --- Eigen/src/Core/arch/NEON/Complex.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index ef50ba303..110371ba8 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -67,7 +67,7 @@ template<> struct unpacket_traits { typedef std::complex type; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { float32x2_t r64; - r64 = vld1_f32((float *)&from); + r64 = vld1_f32((const float *)&from); return Packet2cf(vcombine_f32(r64, r64)); } @@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((float *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -383,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((double *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); } template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) { From b7b868d1c43b1ac0c7367fedb338dfd726a69ee3 Mon Sep 17 00:00:00 2001 From: Jayaram Bobba Date: Fri, 20 Apr 2018 13:39:18 -0700 Subject: [PATCH 315/680] fix AVX512 plog --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 61434a33e..9c1717f76 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -88,9 +88,9 @@ plog(const Packet16f& _x) { // x = x + x - 1.0; // } else { x = x - 1.0; } __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); - Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); + Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x); x = psub(x, p16f_1); - e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); + e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1)); x = padd(x, tmp); Packet16f x2 = pmul(x, x); @@ -119,8 +119,9 @@ plog(const Packet16f& _x) { x = padd(x, y2); // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, - _mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); + return _mm512_mask_blend_ps(iszero_mask, + _mm512_mask_blend_ps(invalid_mask, x, p16f_nan), + p16f_minus_inf); } #endif From 34e499ad36363692615b800b5dbe6cd496341339 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Sat, 21 Apr 2018 22:08:26 +0200 Subject: [PATCH 316/680] Disable -Wshadow when compiling with g++ --- CMakeLists.txt | 6 +----- Eigen/src/Core/util/DisableStupidWarnings.h | 8 ++++++-- Eigen/src/Core/util/ReenableStupidWarnings.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a068739b..5245b2930 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,11 +153,7 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-Wdouble-promotion") # ei_add_cxx_compiler_flag("-Wconversion") - # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6 - # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0")) - if(NOT CMAKE_COMPILER_IS_GNUCXX) - ei_add_cxx_compiler_flag("-Wshadow") - endif() + ei_add_cxx_compiler_flag("-Wshadow") ei_add_cxx_compiler_flag("-Wno-psabi") ei_add_cxx_compiler_flag("-Wno-variadic-macros") diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 8ef0f3594..af832ae77 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -45,12 +45,16 @@ #pragma clang diagnostic ignored "-Wabsolute-value" #endif -#elif defined __GNUC__ && __GNUC__>=6 +#elif defined __GNUC__ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma GCC diagnostic push #endif - #pragma GCC diagnostic ignored "-Wignored-attributes" + // g++ warns about local variables shadowing member functions, which is too strict + #pragma GCC diagnostic ignored "-Wshadow" + #if __GNUC__>=6 + #pragma GCC diagnostic ignored "-Wignored-attributes" + #endif #endif diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h index 86b60f52f..e9353a623 100644 --- a/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -8,7 +8,7 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop - #elif defined __GNUC__ && __GNUC__>=6 + #elif defined __GNUC__ #pragma GCC diagnostic pop #endif From 35b31353abebbbd823457c58d00d85c9f65e8a30 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 22 Apr 2018 22:49:08 +0200 Subject: [PATCH 317/680] Fix unit test --- test/permutationmatrices.cpp | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp index 0ed7a4b2b..b2229cfe9 100644 --- a/test/permutationmatrices.cpp +++ b/test/permutationmatrices.cpp @@ -124,7 +124,7 @@ template void permutationmatrices(const MatrixType& m) lp = lt; rp = rt; VERIFY_EVALUATION_COUNT(m_permuted = lt * m_permuted * rt, 1); - VERIFY_IS_APPROX(m_permuted, lp*m_original*rp); + VERIFY_IS_APPROX(m_permuted, lp*m_original*rp.transpose()); VERIFY_IS_APPROX(lt.inverse()*m_permuted*rt.inverse(), m_original); } From 5679e439e0c87e4b93ad7870f463d5168edcd57d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 23 Apr 2018 14:40:16 +0200 Subject: [PATCH 318/680] bug #1543: fix linear indexing in generic block evaluation (this completes the fix in commit 12efc7d41b80259b996be5781bf596c249c90d3f ) --- Eigen/src/Core/CoreEvaluators.h | 11 ++++++----- test/diagonalmatrices.cpp | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 8419798c1..6231f2edd 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1087,7 +1087,8 @@ struct unary_evaluator, IndexBa typedef typename XprType::CoeffReturnType CoeffReturnType; enum { - RowsAtCompileTime = XprType::RowsAtCompileTime + RowsAtCompileTime = XprType::RowsAtCompileTime, + ForwardLinearAccess = InnerPanel && bool(evaluator::Flags&LinearAccessBit) }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1099,7 +1100,7 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (InnerPanel) + if (ForwardLinearAccess) return m_argImpl.coeff(m_linear_offset.value() + index); else return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); @@ -1114,7 +1115,7 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - if (InnerPanel) + if (ForwardLinearAccess) return m_argImpl.coeffRef(m_linear_offset.value() + index); else return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); @@ -1131,7 +1132,7 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE PacketType packet(Index index) const { - if (InnerPanel) + if (ForwardLinearAccess) return m_argImpl.template packet(m_linear_offset.value() + index); else return packet(RowsAtCompileTime == 1 ? 0 : index, @@ -1149,7 +1150,7 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - if (InnerPanel) + if (ForwardLinearAccess) return m_argImpl.template writePacket(m_linear_offset.value() + index, x); else return writePacket(RowsAtCompileTime == 1 ? 
0 : index, diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index cd6dc8cf0..a2092c43e 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ -99,6 +99,12 @@ template void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() ); VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() ); VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() ); + + sq_m1.setRandom(); + sq_m2 = v1.asDiagonal(); + sq_m2 = sq_m1 * sq_m2; + VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) ); + VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) ); } template From 11123175dbe49a9b17335f67f093ef04a2779d49 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 23 Apr 2018 15:30:35 +0200 Subject: [PATCH 319/680] typo in doc --- Eigen/src/Core/VectorBlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h index d72fbf7e9..0ede5d58e 100644 --- a/Eigen/src/Core/VectorBlock.h +++ b/Eigen/src/Core/VectorBlock.h @@ -35,7 +35,7 @@ struct traits > * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and * most of the time this is the only way it is used. * - * However, if you want to directly maniputate sub-vector expressions, + * However, if you want to directly manipulate sub-vector expressions, * for instance if you want to write a function returning such an expression, you * will need to use this class. * From a57e6e5f0fa8ca5cb621bb07a2a72cbbdf98e300 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 23 Apr 2018 15:31:51 +0200 Subject: [PATCH 320/680] workaround MSVC 2013 compilation issue (ambiguous call) --- Eigen/src/SparseCore/SparseSelfAdjointView.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 5ab64f1a8..65611b3d4 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -311,7 +311,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons while (i && i.index() Date: Mon, 23 Apr 2018 16:26:29 +0200 Subject: [PATCH 321/680] Add specializations of is_arithmetic for long long in c++11 --- Eigen/src/Core/util/Meta.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 1180f0c13..0c7b8e709 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -98,6 +98,8 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; #if EIGEN_HAS_CXX11 +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; using std::is_integral; #else template struct is_integral { enum { value = false }; }; From e8ca5166a921af2ed7df1ddda12fbc9316b2b2cf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 24 Apr 2018 11:19:49 +0200 Subject: [PATCH 322/680] bug #1428: attempt to make NEON vectorization compilable by MSVC. The workaround is to wrap NEON packet types to make them different C++ types.
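For background, a minimal sketch of the failure mode this works around (editorial, not part of the commit; pload is shown with simplified signatures):

    // Under MSVC's arm_neon.h, float32x4_t and int32x4_t are both typedefs of
    // the same builtin __n128 type, so without the wrapper Packet4f and Packet4i
    // name one and the same type, and these two explicit specializations clash:
    template<> Packet4f pload<Packet4f>(const float* from);    // fine everywhere
    template<> Packet4i pload<Packet4i>(const int32_t* from);  // duplicate specialization on MSVC
    // eigen_packet_wrapper<T, unique-id> (see the diff below) gives each packet a
    // distinct C++ type while still converting implicitly to and from the raw NEON type.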
--- Eigen/src/Core/arch/NEON/PacketMath.h | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6283b6cfa..010739380 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -36,12 +36,43 @@ namespace internal { #endif #endif +#if EIGEN_COMP_MSVC + +// In MSVC's arm_neon.h header file, all NEON vector types +// are aliases to the same underlying type __n128. +// We thus have to wrap them to make them different C++ types. +// (See also bug 1428) + +template +struct eigen_packet_wrapper +{ + operator T&() { return m_val; } + operator const T&() const { return m_val; } + eigen_packet_wrapper() {} + eigen_packet_wrapper(const T &v) : m_val(v) {} + eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper Packet2f; +typedef eigen_packet_wrapper Packet4f; +typedef eigen_packet_wrapper Packet4i; +typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet4ui; + +#else + typedef float32x2_t Packet2f; typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; typedef int32x2_t Packet2i; typedef uint32x4_t Packet4ui; +#endif // EIGEN_COMP_MSVC + #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) From 3ffd449ef5f3e2e3d08dc3b683be3a2f291b6e70 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 24 Apr 2018 17:11:51 +0200 Subject: [PATCH 323/680] Workaround warning --- unsupported/test/autodiff_scalar.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index 9cf11280c..a917ec344 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -81,6 +81,9 @@ void check_limits_specialization() typedef std::numeric_limits A; typedef std::numeric_limits B; + // workaround "unused typedef" warning: + VERIFY(!bool(internal::is_same::value)); + #if EIGEN_HAS_CXX11 VERIFY(bool(std::is_base_of::value)); #endif From 2f3287da7df977e1e5faae40cf0276e83369da97 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 24 Apr 2018 17:17:25 +0200 Subject: [PATCH 324/680] Fix "used uninitialized" warnings --- unsupported/test/autodiff.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp index 85743137e..1c5e0dc66 100644 --- a/unsupported/test/autodiff.cpp +++ b/unsupported/test/autodiff.cpp @@ -306,6 +306,8 @@ double bug_1222() { return denom.value(); } +#ifdef EIGEN_TEST_PART_5 + double bug_1223() { using std::min; typedef Eigen::AutoDiffScalar AD; @@ -326,8 +328,8 @@ double bug_1223() { // regression test for some compilation issues with specializations of ScalarBinaryOpTraits void bug_1260() { - Matrix4d A; - Vector4d v; + Matrix4d A = Matrix4d::Ones(); + Vector4d v = Vector4d::Ones(); A*v; } @@ -336,7 +338,7 @@ double bug_1261() { typedef AutoDiffScalar AD; typedef Matrix VectorAD; - VectorAD v; + VectorAD v(0.,0.); const AD maxVal = v.maxCoeff(); const AD minVal = v.minCoeff(); return maxVal.value() + minVal.value(); @@ -344,12 +346,14 @@ double bug_1261() { double bug_1264() { typedef AutoDiffScalar AD; - const AD s; - const Matrix v1; + const AD s = 0.; + const Matrix v1(0.,0.,0.); const Matrix v2 = (s + 3.0) * v1; return v2(0).value(); } +#endif + void
test_autodiff() CALL_SUBTEST_4( test_autodiff_hessian<1>() ); } - bug_1222(); - bug_1223(); - bug_1260(); - bug_1261(); + CALL_SUBTEST_5( bug_1222() ); + CALL_SUBTEST_5( bug_1223() ); + CALL_SUBTEST_5( bug_1260() ); + CALL_SUBTEST_5( bug_1261() ); } From 8810baaed40f554ff59db65d5f5025ef5247c3c2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 25 Apr 2018 10:14:48 +0200 Subject: [PATCH 325/680] Add multi-threading for sparse-row-major * dense-row-major --- Eigen/src/SparseCore/SparseDenseProduct.h | 38 ++++++++++++++++++----- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h index 0547db596..f005a18a1 100644 --- a/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/Eigen/src/SparseCore/SparseDenseProduct.h @@ -88,10 +88,11 @@ struct sparse_time_dense_product_impl::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef evaluator LhsEval; + typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) { - evaluator lhsEval(lhs); + LhsEval lhsEval(lhs); for(Index c=0; c::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef evaluator LhsEval; + typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) { - evaluator lhsEval(lhs); - for(Index j=0; j1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000) { - typename Res::RowXpr res_j(res.row(j)); - for(LhsInnerIterator it(lhsEval,j); it ;++it) - res_j += (alpha*it.value()) * rhs.row(it.index()); + #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) + for(Index i=0; i Date: Thu, 26 Apr 2018 10:47:39 +0200 Subject: [PATCH 326/680] Fix unit test for SIMD engine not supporting sqrt --- test/packetmath.cpp | 56 +++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 53838920e..4b19be92d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -451,35 +451,41 @@ template void packetmath_real() { data1[0] = std::numeric_limits::quiet_NaN(); data1[1] = std::numeric_limits::epsilon(); - packet_helper h; - h.store(data2, internal::plog(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - VERIFY_IS_EQUAL(std::log(std::numeric_limits::epsilon()), data2[1]); + { + packet_helper h; + h.store(data2, internal::plog(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + VERIFY_IS_EQUAL(std::log(std::numeric_limits::epsilon()), data2[1]); - data1[0] = -std::numeric_limits::epsilon(); - data1[1] = 0; - h.store(data2, internal::plog(h.load(data1))); - VERIFY((numext::isnan)(data2[0])); - VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]); + data1[0] = -std::numeric_limits::epsilon(); + data1[1] = 0; + h.store(data2, internal::plog(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]); - data1[0] = (std::numeric_limits::min)(); - data1[1] = -(std::numeric_limits::min)(); - h.store(data2, internal::plog(h.load(data1))); - VERIFY_IS_EQUAL(std::log((std::numeric_limits::min)()), data2[0]); - 
VERIFY((numext::isnan)(data2[1])); data1[0] = std::numeric_limits::denorm_min(); data1[1] = -std::numeric_limits::denorm_min(); h.store(data2, internal::plog(h.load(data1))); // VERIFY_IS_EQUAL(std::log(std::numeric_limits::denorm_min()), data2[0]); VERIFY((numext::isnan)(data2[1])); data1[0] = Scalar(-1.0f); h.store(data2, internal::plog(h.load(data1))); VERIFY((numext::isnan)(data2[0])); } { packet_helper h; data1[0] = Scalar(-1.0f); h.store(data2, internal::psqrt(h.load(data1))); VERIFY((numext::isnan)(data2[0])); VERIFY((numext::isnan)(data2[1])); } } } From 6e7118265dc47e07107859c0cdba7eb37e2200e7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Apr 2018 10:50:41 +0200 Subject: [PATCH 327/680] Fix compilation with NEON+MSVC --- Eigen/src/Core/arch/NEON/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 110371ba8..306a309be 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -277,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con s = vmulq_f32(b.v, b.v); rev_s = vrev64q_f32(s); - return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); + return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } EIGEN_DEVICE_FUNC inline void From 6118c6ff4f898dad99564ffdeb99036beb2ff0ea Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 30 Apr 2018 11:28:12 -0700 Subject: [PATCH 328/680] Enable RawAccess to tensor slices whenever possible.
Avoid 32-bit integer overflow in TensorSlicingOp --- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 329655817..e59074506 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -398,7 +398,7 @@ struct TensorEvaluator, Devi const MemcpyTriggerForSlicing trigger(m_device); if (trigger(contiguous_values)) { Scalar* src = (Scalar*)m_impl.data(); - for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); } @@ -559,7 +559,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, - RawAccess = false + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) From b8c8e5f436743ac6a6a5ed0ad7ec5cce7dd00248 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 14 May 2018 16:07:13 -0700 Subject: [PATCH 329/680] Add vectorized clip functor for Eigen Tensors. --- .../Eigen/CXX11/src/Tensor/TensorBase.h | 6 ++++++ .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 19 +++++++++++++++++ unsupported/test/cxx11_tensor_expr.cpp | 21 +++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 1d459a3a0..c8dc3bbc5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -209,6 +209,12 @@ class TensorBase return unaryExpr(internal::scalar_abs_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + clip(Scalar min, Scalar max) const { + return unaryExpr(internal::scalar_clip_op(min, max)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> conjugate() const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 5dcc3794c..4cee38339 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -487,6 +487,25 @@ struct functor_traits > { }; }; +template +struct scalar_clip_op { + EIGEN_DEVICE_FUNC inline scalar_clip_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& x) const { + return numext::mini(numext::maxi(x, m_min), m_max); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& x) const { + return internal::pmin(internal::pmax(x, pset1(m_min)), pset1(m_max)); + } + const Scalar m_min; + const Scalar m_max; +}; +template +struct functor_traits > +{ enum { Cost = 2 * NumTraits::AddCost, PacketAccess = (packet_traits::HasMin && packet_traits::HasMax)}; }; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 129b4e659..30e5a9860 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -340,6 +340,26 @@ void 
test_minmax_nan_propagation_templ() { } } +static void test_clip() +{ + Tensor vec(6); + vec(0) = 4.0; + vec(1) = 8.0; + vec(2) = 15.0; + vec(3) = 16.0; + vec(4) = 23.0; + vec(5) = 42.0; + + float kMin = 20; + float kMax = 30; + + Tensor vec_clipped(6); + vec_clipped = vec.clip(kMin, kMax); + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(vec_clipped(i), std::min(std::max(vec(i), kMin), kMax)); + } +} + static void test_minmax_nan_propagation() { test_minmax_nan_propagation_templ(); @@ -356,5 +376,6 @@ void test_cxx11_tensor_expr() CALL_SUBTEST(test_functors()); CALL_SUBTEST(test_type_casting()); CALL_SUBTEST(test_select()); + CALL_SUBTEST(test_clip()); CALL_SUBTEST(test_minmax_nan_propagation()); } From afec3021f731d053254fe64983fc0623fff507f9 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 14 May 2018 16:35:39 -0700 Subject: [PATCH 330/680] Use numext::maxi & numext::mini. --- unsupported/test/cxx11_tensor_expr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 30e5a9860..d64301318 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -356,7 +356,7 @@ static void test_clip() Tensor vec_clipped(6); vec_clipped = vec.clip(kMin, kMax); for (int i = 0; i < 6; ++i) { - VERIFY_IS_EQUAL(vec_clipped(i), std::min(std::max(vec(i), kMin), kMax)); + VERIFY_IS_EQUAL(vec_clipped(i), numext::mini(numext::maxi(vec(i), kMin), kMax)); } } From 0272f2451a1c85e0d74235ac8f5c468776171580 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 15 May 2018 19:35:53 +0200 Subject: [PATCH 331/680] Fix "suggest parentheses around comparison" warning --- Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 9176a1382..49565c070 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -37,7 +37,7 @@ namespace Eigen { namespace internal { -template +template struct general_matrix_matrix_rankupdate : general_matrix_matrix_triangular_product< Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {}; @@ -52,7 +52,7 @@ struct general_matrix_matrix_triangular_product& blocking) \ { \ - if ( lhs==rhs && ((UpLo&(Lower|Upper)==UpLo)) ) { \ + if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \ general_matrix_matrix_rankupdate \ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ From 812480baa340f2d8b0bc21b3f2d227168280ad06 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 16 May 2018 09:49:24 -0700 Subject: [PATCH 332/680] Rename scalar_clip_op to scalar_clip2_op to prevent collision with existing functor in TensorFlow. 
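The rename is internal only: the public Tensor API entry point introduced in PATCH 329 keeps its name. A minimal usage sketch (illustrative, not part of this patch; values are hypothetical):

    #include <unsupported/Eigen/CXX11/Tensor>

    void clip_demo() {
      Eigen::Tensor<float, 1> t(3);
      t.setValues({10.f, 25.f, 40.f});
      // Element-wise clamp to [20, 30]; after this patch the expression is
      // backed by internal::scalar_clip2_op<float> instead of scalar_clip_op.
      Eigen::Tensor<float, 1> clipped = t.clip(20.f, 30.f);  // {20.f, 25.f, 30.f}
    }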
--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index c8dc3bbc5..497e2a797 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -210,9 +210,9 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> clip(Scalar min, Scalar max) const { - return unaryExpr(internal::scalar_clip_op(min, max)); + return unaryExpr(internal::scalar_clip2_op(min, max)); } EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 4cee38339..adadb5afc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -488,8 +488,8 @@ struct functor_traits > { }; template -struct scalar_clip_op { - EIGEN_DEVICE_FUNC inline scalar_clip_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} +struct scalar_clip2_op { + EIGEN_DEVICE_FUNC inline scalar_clip2_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { return numext::mini(numext::maxi(x, m_min), m_max); @@ -503,7 +503,7 @@ struct scalar_clip_op { const Scalar m_max; }; template -struct functor_traits > +struct functor_traits > { enum { Cost = 2 * NumTraits::AddCost, PacketAccess = (packet_traits::HasMin && packet_traits::HasMax)}; }; } // end namespace internal From b8d36774faafa89ffc016c2c35f7e906035b2fa5 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 16 May 2018 14:04:48 -0700 Subject: [PATCH 333/680] Rename clip2 to clamp. 
--- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 497e2a797..745b16d2c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -210,9 +210,9 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> clip(Scalar min, Scalar max) const { - return unaryExpr(internal::scalar_clip2_op(min, max)); + return unaryExpr(internal::scalar_clamp_op(min, max)); } EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index adadb5afc..0efcd6253 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -488,8 +488,8 @@ struct functor_traits > { }; template -struct scalar_clip2_op { - EIGEN_DEVICE_FUNC inline scalar_clip2_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} +struct scalar_clamp_op { + EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { return numext::mini(numext::maxi(x, m_min), m_max); @@ -503,7 +503,7 @@ struct scalar_clip2_op { const Scalar m_max; }; template -struct functor_traits > +struct functor_traits > { enum { Cost = 2 * NumTraits::AddCost, PacketAccess = (packet_traits::HasMin && packet_traits::HasMax)}; }; } // end namespace internal From d6559009530b87f80fc5aaa864c012ae5109e848 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 17 May 2018 19:17:01 +0200 Subject: [PATCH 334/680] bug #1544: Generate correct Q matrix in complex case. Original patch was by Jeff Trull in PR-386. --- Eigen/src/SparseQR/SparseQR.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index f7111fe2e..278d775aa 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -52,7 +52,7 @@ namespace internal { * rank-revealing permutations. Use colsPermutation() to get it. * * Q is the orthogonal matrix represented as products of Householder reflectors. - * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose. + * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint. * You can then apply it to a vector. * * R is the sparse triangular or trapezoidal matrix. The later occurs when A is rank-deficient. @@ -65,6 +65,7 @@ namespace internal { * \implsparsesolverconcept * * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). + * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix. 
* */ template @@ -196,9 +197,9 @@ class SparseQR : public SparseSolverBase > Index rank = this->rank(); - // Compute Q^T * b; + // Compute Q^* * b; typename Dest::PlainObject y, b; - y = this->matrixQ().transpose() * B; + y = this->matrixQ().adjoint() * B; b = y; // Solve with the triangular matrix R @@ -641,7 +642,7 @@ struct SparseQR_QProduct : ReturnByValue @@ -668,13 +669,14 @@ struct SparseQRMatrixQReturnType : public EigenBase(m_qr,other.derived(),false); } + // To use for operations with the adjoint of Q SparseQRMatrixQTransposeReturnType adjoint() const { return SparseQRMatrixQTransposeReturnType(m_qr); } inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); } - // To use for operations with the transpose of Q + // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment SparseQRMatrixQTransposeReturnType transpose() const { return SparseQRMatrixQTransposeReturnType(m_qr); @@ -682,6 +684,7 @@ struct SparseQRMatrixQReturnType : public EigenBase struct SparseQRMatrixQTransposeReturnType { From 9f0c5c3669a985a0dfa44db2ab05306edf1130c1 Mon Sep 17 00:00:00 2001 From: Jeff Trull Date: Thu, 15 Feb 2018 15:00:31 -0800 Subject: [PATCH 335/680] Make sparse QR result sizes consistent with dense QR, with the following rules: 1) Q is always square 2) Q*R*P' is valid and recovers the original matrix This implies that the size of Q is the number of rows in the original matrix, square, and that the size of R is the size of the original matrix. --- Eigen/src/SparseQR/SparseQR.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 278d775aa..7409fcae9 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -328,7 +328,7 @@ void SparseQR::analyzePattern(const MatrixType& mat) internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data()); m_isEtreeOk = true; - m_R.resize(diagSize, n); + m_R.resize(m, n); m_Q.resize(m, diagSize); // Allocate space for nonzero elements : rough estimation @@ -605,7 +605,7 @@ struct SparseQR_QProduct : ReturnByValue(m_qr); } inline Index rows() const { return m_qr.rows(); } - inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); } + inline Index cols() const { return m_qr.rows(); } // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment SparseQRMatrixQTransposeReturnType transpose() const { @@ -715,7 +718,7 @@ struct Assignment, internal: typedef typename DstXprType::StorageIndex StorageIndex; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) { - typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows()); + typename DstXprType::PlainObject idMat(src.rows(), src.cols()); idMat.setIdentity(); // Sort the sparse householder reflectors if needed const_cast(&src.m_qr)->_sort_matrix_Q(); From b2053990d078e15017610d526445ce78f2632edf Mon Sep 17 00:00:00 2001 From: Robert Lukierski Date: Wed, 14 Mar 2018 16:19:43 +0000 Subject: [PATCH 336/680] Adding EIGEN_DEVICE_FUNC to Products, especially Dense2Dense Assignment specializations. Otherwise causes problems with small fixed size matrix multiplication (call to 0x00 in call_assignment_no_alias in debug mode or trap in release with CUDA 9.1). 
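A minimal sketch of the failure mode being fixed (hypothetical reproducer, assuming nvcc with CUDA 9.1; not part of this patch):

    #include <Eigen/Dense>

    __global__ void small_product_kernel(const Eigen::Matrix3f* a,
                                         const Eigen::Matrix3f* b,
                                         Eigen::Matrix3f* c) {
      // Before this patch, the Dense2Dense assignment path behind operator*
      // lacked EIGEN_DEVICE_FUNC, so this device-side fixed-size product could
      // resolve to a host-only symbol: call to 0x00 in debug, a trap in release.
      *c = (*a) * (*b);
    }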
--- Eigen/src/Core/ProductEvaluators.h | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index fb637191d..5d2737d01 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -137,7 +137,7 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -155,7 +155,7 @@ struct Assignment, internal::add_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -170,7 +170,7 @@ struct Assignment, internal::sub_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -190,7 +190,7 @@ struct Assignment, const CwiseNullaryOp,Plain>, const Product > SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) { call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func); @@ -217,7 +217,7 @@ template - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) { call_assignment_no_alias(dst, src.lhs(), Func1()); @@ -246,19 +246,19 @@ template struct generic_product_impl { template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -269,7 +269,7 @@ struct generic_product_impl // Column major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); typename nested_eval::type actual_lhs(lhs); @@ -282,7 +282,7 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const // Row major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, 
const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); typename nested_eval::type actual_rhs(rhs); @@ -300,37 +300,37 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose - struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; - struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; - struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; + struct set { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; + struct add { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; + struct sub { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; struct adds { Scalar m_scale; explicit adds(const Scalar& s) : m_scale(s) {} - template void operator()(const Dst& dst, const Src& src) const { + template void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += m_scale * src; } }; template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); } template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); } template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); } @@ -345,15 +345,15 @@ struct generic_product_impl_base typedef typename Product::Scalar Scalar; template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } template @@ -373,7 +373,7 @@ struct generic_product_impl typedef typename internal::remove_all::type>::type MatrixType; template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + 
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { LhsNested actual_lhs(lhs); RhsNested actual_rhs(rhs); @@ -390,7 +390,7 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); // but easier on the compiler side } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() += lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op()); } template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); From e7147f69ae3282b7ae9feb80ffdcc4ae432c68e0 Mon Sep 17 00:00:00 2001 From: Jeff Trull Date: Sat, 21 Apr 2018 10:26:30 -0700 Subject: [PATCH 337/680] Add tests for sparseQR results (value and size) covering bugs #1522 and #1544 --- test/sparseqr.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/sparseqr.cpp b/test/sparseqr.cpp index e8605fd21..f0e721fce 100644 --- a/test/sparseqr.cpp +++ b/test/sparseqr.cpp @@ -54,6 +54,28 @@ template void test_sparseqr_scalar() b = dA * DenseVector::Random(A.cols()); solver.compute(A); + + // Q should be MxM + VERIFY_IS_EQUAL(solver.matrixQ().rows(), A.rows()); + VERIFY_IS_EQUAL(solver.matrixQ().cols(), A.rows()); + + // R should be MxN + VERIFY_IS_EQUAL(solver.matrixR().rows(), A.rows()); + VERIFY_IS_EQUAL(solver.matrixR().cols(), A.cols()); + + // Q and R can be multiplied + DenseMat recoveredA = solver.matrixQ() + * DenseMat(solver.matrixR().template triangularView()) + * solver.colsPermutation().transpose(); + VERIFY_IS_EQUAL(recoveredA.rows(), A.rows()); + VERIFY_IS_EQUAL(recoveredA.cols(), A.cols()); + + // and in the full rank case the original matrix is recovered + if (solver.rank() == A.cols()) + { + VERIFY_IS_APPROX(A, recoveredA); + } + if(internal::random(0,1)>0.5f) solver.factorize(A); // this checks that calling analyzePattern is not needed if the pattern do not change. if (solver.info() != Success) From 7134fa7a2eea469f35ea12899e693a633b5b42e5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 7 Jun 2018 09:33:10 +0200 Subject: [PATCH 338/680] Fix compilation with MSVC by reverting to char* for _mm_prefetch except for PGI (the latter being the one that has the wrong prototype). 
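For reference, the prototype mismatch being worked around looks roughly like this (hedged summary inferred from this patch, not a verbatim quote of any vendor header):

    void _mm_prefetch(char const* p, int sel);    // MSVC and most other compilers
    void _mm_prefetch(const void* p, int hint);   // PGI

Since no single cast compiles everywhere, the change below funnels the cast through a SsePrefetchPtrType typedef that is const void* under EIGEN_COMP_PGI and const char* otherwise.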
--- Eigen/src/Core/arch/AVX/PacketMath.h | 6 +++--- Eigen/src/Core/arch/AVX512/PacketMath.h | 6 +++--- Eigen/src/Core/arch/SSE/Complex.h | 4 ++-- Eigen/src/Core/arch/SSE/PacketMath.h | 12 +++++++++--- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 59aebd912..774e64981 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) } #ifndef EIGEN_VECTORIZE_AVX512 -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index eb5de43d4..4e2e916de 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -604,9 +604,9 @@ EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) { pstore(to, pa); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template <> EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) { diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index c618cfaaa..d075043ce 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -324,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } 
-template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index d1a7c65be..c944d2c0e 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -461,10 +461,16 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } +#if EIGEN_COMP_PGI +typedef const void * SsePrefetchPtrType; +#else +typedef const char * SsePrefetchPtrType; +#endif + #ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 From 6293ad3f392a7b97ebb9f9f874682505c1391f2d Mon Sep 17 00:00:00 2001 From: Vamsi Sripathi Date: Wed, 23 May 2018 14:02:05 -0700 Subject: [PATCH 339/680] Performance improvements to tensor broadcast operation 1. Added new packet functions using SIMD for NByOne, OneByN cases 2. Modified existing packet functions to reduce index calculations when input stride is non-SIMD 3. 
Added 4 test cases to cover the new packet functions --- .../CXX11/src/Tensor/TensorBroadcasting.h | 110 +++++++++++++++++- .../test/cxx11_tensor_broadcasting.cpp | 62 ++++++++++ 2 files changed, 168 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6c93aff9..9ab6b3565 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -105,6 +105,7 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = internal::unpacket_traits::size; + bool nByOne = false, oneByN = false; enum { IsAligned = true, @@ -142,6 +143,24 @@ struct TensorEvaluator, Device> m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; } } + + if (input_dims[0] == 1) { + oneByN = true; + for (int i = 1; i < NumDims; ++i) { + if (broadcast[i] != 1) { + oneByN = false; + break; + } + } + } else if (input_dims[NumDims-1] == 1) { + nByOne = true; + for (int i = 0; i < NumDims-1; ++i) { + if (broadcast[i] != 1) { + nByOne = false; + break; + } + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -237,9 +256,84 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); + if (oneByN) { + return packetNByOne(index); + } else if (nByOne) { + return packetOneByN(index); + } else { + return packetColMajor(index); + } } else { - return packetRowMajor(index); + if (oneByN) { + return packetOneByN(index); + } else if (nByOne) { + return packetNByOne(index); + } else { + return packetRowMajor(index); + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index dim, inputIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = NumDims - 1; + } else { + dim = 0; + } + + inputIndex = index % m_inputStrides[dim]; + if (inputIndex + PacketSize <= m_inputStrides[dim]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + if (inputIndex > m_inputStrides[dim]-1) { + inputIndex = 0; + } + values[i] = m_impl.coeff(inputIndex++); + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index dim, inputIndex, outputOffset; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = 1; + } else { + dim = NumDims - 2; + } + + inputIndex = index / m_outputStrides[dim]; + outputOffset = index % m_outputStrides[dim]; + if (outputOffset + PacketSize <= m_outputStrides[dim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[dim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + values[i] = m_impl.coeff(++inputIndex); + outputOffset = 0; + cur = 0; + 
} + } + return internal::pload(values); } } @@ -290,7 +384,11 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffColMajor(originalIndex+i); + if (innermostLoc + i < m_impl.dimensions()[0]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffColMajor(originalIndex+i); + } } PacketReturnType rslt = internal::pload(values); return rslt; @@ -342,7 +440,11 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffRowMajor(originalIndex+i); + if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffRowMajor(originalIndex+i); + } } PacketReturnType rslt = internal::pload(values); return rslt; diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 5c0ea5889..a9d268ea6 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -180,6 +180,64 @@ static void test_fixed_size_broadcasting() #endif } +template +static void test_simple_broadcasting_one_by_n() +{ + Tensor tensor(1,13,5,7); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 9; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 1; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 9); + VERIFY_IS_EQUAL(broadcast.dimension(1), 13); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 7); + + for (int i = 0; i < 9; ++i) { + for (int j = 0; j < 13; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l)); + } + } + } + } +} + +template +static void test_simple_broadcasting_n_by_one() +{ + Tensor tensor(7,3,5,1); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 1; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 19; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 7); + VERIFY_IS_EQUAL(broadcast.dimension(1), 3); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 19); + + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 19; ++l) { + VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l)); + } + } + } + } +} + void test_cxx11_tensor_broadcasting() { @@ -191,4 +249,8 @@ void test_cxx11_tensor_broadcasting() CALL_SUBTEST(test_static_broadcasting()); CALL_SUBTEST(test_fixed_size_broadcasting()); CALL_SUBTEST(test_fixed_size_broadcasting()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n()); + CALL_SUBTEST(test_simple_broadcasting_n_by_one()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n()); + CALL_SUBTEST(test_simple_broadcasting_n_by_one()); } From 405859f18dac56f324e1d93ca8721d5f7fd22c62 Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Thu, 17 May 2018 17:04:00 +0100 Subject: [PATCH 340/680] Set EIGEN_IDEAL_MAX_ALIGN_BYTES correctly for AVX512 builds bug #1548 The macro EIGEN_IDEAL_MAX_ALIGN_BYTES is being incorrectly set to 32 on AVX512 builds. It should be set to 64. In the current code it is only set to 64 if the macro EIGEN_VECTORIZE_AVX512 is defined. 
This macro does get defined in AVX512 builds in Core, but only after Macros.h, the file that defines EIGEN_IDEAL_MAX_ALIGN_BYTES, has been included. This commit fixes the issue by setting EIGEN_IDEAL_MAX_ALIGN_BYTES to 64 if __AVX512F__ is defined. --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 9c68ecb7d..729fb3b1d 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -702,7 +702,7 @@ namespace Eigen { // If the user explicitly disable vectorization, then we also disable alignment #if defined(EIGEN_DONT_VECTORIZE) #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 -#elif defined(EIGEN_VECTORIZE_AVX512) +#elif defined(__AVX512F__) // 64 bytes static alignment is preferred only if really required #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 #elif defined(__AVX__) From 345c0ab450d623b79868ea60253db9d0d8bdd5c7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 May 2018 13:46:46 +0200 Subject: [PATCH 341/680] check that all integer types are properly handled by mat(i,j) --- test/indexed_view.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 8b3082cea..71de60d84 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -371,6 +371,21 @@ void check_indexed_view() a(X) = 1; A(X,Y) = 1; + // Check compilation of varying integer types as index types: + Index i = n/2; + short i_short(i); + std::size_t i_sizet(i); + VERIFY_IS_EQUAL( a(i), a.coeff(i_short) ); + VERIFY_IS_EQUAL( a(i), a.coeff(i_sizet) ); + + VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i_short) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_short) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_sizet) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i_short) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(5, i_sizet) ); + } void test_indexed_view() From 4dd767f455f1adfea7cb2febeab3efaa81246845 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 May 2018 13:59:55 +0200 Subject: [PATCH 342/680] add some internal checks --- Eigen/src/Core/util/XprHelper.h | 6 ++++++ Eigen/src/plugins/IndexedViewMethods.h | 11 +++-------- test/indexed_view.cpp | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 10328be0d..1217404d6 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -47,6 +47,12 @@ template struct is_valid_index_type }; }; +// true if both types are not valid index types +template +struct valid_indexed_view_overload { + enum { value = !(internal::is_valid_index_type::value && internal::is_valid_index_type::value) }; +}; + // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar // Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression. 
diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 9ad2d9aee..d435aa597 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -53,11 +53,6 @@ ivcSize(const Indices& indices) const { return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().size()),Specialized); } -template -struct valid_indexed_view_overload { - enum { value = !(internal::is_valid_index_type::value && internal::is_valid_index_type::value) }; -}; - public: #endif @@ -72,7 +67,7 @@ struct EIGEN_INDEXED_VIEW_METHOD_TYPE { // This is the generic version template -typename internal::enable_if::value +typename internal::enable_if::value && internal::traits::type>::ReturnAsIndexedView, typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type >::type operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST @@ -84,7 +79,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND // The following overload returns a Block<> object template -typename internal::enable_if::value +typename internal::enable_if::value && internal::traits::type>::ReturnAsBlock, typename internal::traits::type>::BlockType>::type operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST @@ -102,7 +97,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND // The following overload returns a Scalar template -typename internal::enable_if::value +typename internal::enable_if::value && internal::traits::type>::ReturnAsScalar, CoeffReturnType >::type operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 71de60d84..033d8833f 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -395,4 +395,19 @@ void test_indexed_view() CALL_SUBTEST_2( check_indexed_view() ); CALL_SUBTEST_3( check_indexed_view() ); // } + + // static checks of some internals: + + #define STATIC_CHECK( COND ) \ + EIGEN_STATIC_ASSERT( (COND) , EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT ) + + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( !internal::valid_indexed_view_overload::value )); + STATIC_CHECK(( !internal::valid_indexed_view_overload::value )); + STATIC_CHECK(( !internal::valid_indexed_view_overload::value )); + STATIC_CHECK(( !internal::valid_indexed_view_overload::value )); } From a382bc9364b5d5a049678f8cc138670ca1c7a4fa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 May 2018 17:02:27 +0200 Subject: [PATCH 343/680] is_convertible does not seem to work well with MSVC 2013, so let's rather use __is_enum(T) for old MSVC versions --- Eigen/src/Core/util/XprHelper.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 1217404d6..9ffb0474b 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -39,10 +39,12 @@ template struct is_valid_index_type { enum { value = #if EIGEN_HAS_TYPE_TRAITS - internal::is_integral::value || std::is_enum::value +#else if EIGEN_COMP_MSVC >= 
1500 + internal::is_integral::value || __is_enum(T) #else - // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. - internal::is_convertible::value + // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. + internal::is_convertible::value #endif }; }; From 725bd929038300c89c079325d1989e542beaf23c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 18 May 2018 17:46:43 +0200 Subject: [PATCH 344/680] fix stupid typo --- Eigen/src/Core/util/XprHelper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 9ffb0474b..16714fa5c 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -40,7 +40,7 @@ template struct is_valid_index_type enum { value = #if EIGEN_HAS_TYPE_TRAITS internal::is_integral::value || std::is_enum::value -#else if EIGEN_COMP_MSVC >= 1500 +#elif EIGEN_COMP_MSVC internal::is_integral::value || __is_enum(T) #else // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. From 36e413a534e105ec3864b1a163e0ec3234f866a1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 May 2018 18:51:35 +0200 Subject: [PATCH 345/680] Workaround a MSVC 2013 compilation issue with MatrixBase(Index,int) --- Eigen/src/SparseLU/SparseLU.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 383a203b4..3bc06f4e3 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -747,8 +747,9 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator } else { + // FIXME: the following lines should use Block expressions and not Map! Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); U = A.template triangularView().solve(U); } From f0862b062fcb613ee6c60745631d90a43d54a6d4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 22 May 2018 19:29:51 +0200 Subject: [PATCH 346/680] Fix internal::is_integral with MSVC 2013 and older. 
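A minimal sketch of code that this fix is expected to make compile (illustrative only; assumes 64-bit MSVC 2013 without C++11 type traits, where std::size_t is an unsigned 64-bit type):

    #include <Eigen/Dense>
    #include <cstddef>

    void index_type_demo() {
      Eigen::MatrixXd A(4, 4);
      A.setZero();
      std::size_t i = 2;
      short j = 1;
      // Mixed integer index types are routed through is_valid_index_type
      // (PATCH 342/343), which on old MSVC reduces to
      // internal::is_integral<T>::value || __is_enum(T).
      double x = A(i, j);
      (void)x;
    }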
--- Eigen/src/Core/util/Meta.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 0c7b8e709..6e5af35c0 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -113,6 +113,10 @@ template<> struct is_integral { enum { value = true }; } template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; +#if EIGEN_COMP_MSVC +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +#endif #endif From d06a753d1017255f2a10a5b00a830865c30fe4df Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 22 May 2018 20:29:17 +0200 Subject: [PATCH 347/680] Make qr_fullpivoting unit test run for fixed-sized matrices --- test/qr_fullpivoting.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp index 70e89c198..3d9accea6 100644 --- a/test/qr_fullpivoting.cpp +++ b/test/qr_fullpivoting.cpp @@ -15,11 +15,12 @@ template void qr() { typedef typename MatrixType::Index Index; + static const int Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime; Index max_size = EIGEN_TEST_MAX_SIZE; Index min_size = numext::maxi(1,EIGEN_TEST_MAX_SIZE/10); - Index rows = internal::random(min_size,max_size), - cols = internal::random(min_size,max_size), - cols2 = internal::random(min_size,max_size), + Index rows = Rows == Dynamic ? internal::random(min_size,max_size) : Rows, + cols = Cols == Dynamic ? internal::random(min_size,max_size) : Cols, + cols2 = Cols == Dynamic ? internal::random(min_size,max_size) : Cols, rank = internal::random(1, (std::min)(rows, cols)-1); typedef typename MatrixType::Scalar Scalar; @@ -128,9 +129,10 @@ template void qr_verify_assert() void test_qr_fullpivoting() { - for(int i = 0; i < 1; i++) { - // FIXME : very weird bug here -// CALL_SUBTEST(qr(Matrix2f()) ); + for(int i = 0; i < 1; i++) { + CALL_SUBTEST_5( qr() ); + CALL_SUBTEST_6( qr() ); + CALL_SUBTEST_8( qr() ); CALL_SUBTEST_1( qr() ); CALL_SUBTEST_2( qr() ); CALL_SUBTEST_3( qr() ); From 750af063629cd366f6d01985b37a21ebc37a5af3 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 22 May 2018 21:04:32 +0200 Subject: [PATCH 348/680] Add an option to test with external BLAS library --- cmake/EigenTesting.cmake | 4 ++++ test/CMakeLists.txt | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 16d6d279f..7d2d63722 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -679,6 +679,10 @@ macro(ei_set_build_string) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${LOCAL_COMPILER_FLAGS}) endif() + if(EIGEN_TEST_EXTERNAL_BLAS) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-external_blas) + endif() + ei_is_64bit_env(IS_64BIT_ENV) if(NOT IS_64BIT_ENV) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-32bit) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f8deafc07..073effd81 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -30,6 +30,15 @@ if(NOT EIGEN_Fortran_COMPILER_WORKS) find_package(LAPACK QUIET) endif() +# TODO do the same for EXTERNAL_LAPACK +option(EIGEN_TEST_EXTERNAL_BLAS "Use external BLAS library for testsuite" OFF) +if(EIGEN_TEST_EXTERNAL_BLAS) + find_package(BLAS REQUIRED) + message(STATUS "BLAS_COMPILER_FLAGS: ${BLAS_COMPILER_FLAGS}") + add_definitions("-DEIGEN_USE_BLAS") # is adding 
${BLAS_COMPILER_FLAGS} necessary? + list(APPEND EXTERNAL_LIBS "${BLAS_LIBRARIES}") +endif(EIGEN_TEST_EXTERNAL_BLAS) + # configure blas/lapack (use Eigen's ones) set(EIGEN_BLAS_LIBRARIES eigen_blas) set(EIGEN_LAPACK_LIBRARIES eigen_lapack) From 49262dfee6785bcf161ee48e619035bc5b30976c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 May 2018 15:09:31 +0200 Subject: [PATCH 349/680] Fix compilation and SSE support with PGI compiler --- Eigen/src/Core/arch/AVX/PacketMath.h | 6 +++--- Eigen/src/Core/arch/SSE/Complex.h | 4 ++-- Eigen/src/Core/arch/SSE/PacketMath.h | 16 +++++++++++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 34b89c790..59aebd912 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) } #ifndef EIGEN_VECTORIZE_AVX512 -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } #endif template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 23e717f28..c618cfaaa 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -324,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 03c8a2c13..d1a7c65be 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -462,9 +462,9 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& } #ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { 
_mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); } #endif #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 @@ -928,4 +928,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co } // end namespace Eigen +#if EIGEN_COMP_PGI +// PGI++ does not define the following intrinsics in C++ mode. +static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } +static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); } +static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); } +static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); } +#endif + #endif // EIGEN_PACKET_MATH_SSE_H From 647b724a364f4492f39f607d7007b7feac838bf8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 May 2018 20:46:46 +0200 Subject: [PATCH 350/680] Define pcast<> for SSE types even when AVX is enabled. (otherwise float are silently reinterpreted as int instead of being converted) --- Eigen/Core | 1 + Eigen/src/Core/arch/SSE/TypeCasting.h | 28 +++++++++++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index c74340b81..f6bc18a08 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -413,6 +413,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/SSE/TypeCasting.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h index c84893230..c6ca8c716 100644 --- a/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -14,6 +14,7 @@ namespace Eigen { namespace internal { +#ifndef EIGEN_VECTORIZE_AVX template <> struct type_casting_traits { enum { @@ -23,11 +24,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return _mm_cvttps_epi32(a); -} - - template <> struct type_casting_traits { enum { @@ -37,11 +33,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return _mm_cvtepi32_ps(a); -} - - template <> struct type_casting_traits { enum { @@ -51,10 +42,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { - return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); -} - template <> struct type_casting_traits { enum { @@ -63,6 +50,19 @@ struct type_casting_traits { TgtCoeffRatio = 2 }; }; +#endif + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return 
_mm_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return _mm_cvtepi32_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); +} template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { // Simply discard the second half of the input From eef4b7bd8768a79078ba6d4adc9f85b3f5937e39 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 May 2018 20:49:06 +0200 Subject: [PATCH 351/680] Fix handling of path names containing spaces and the likes. --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 073effd81..12c12b06d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,7 +47,7 @@ set(EIGEN_TEST_MATRIX_DIR "" CACHE STRING "Enable testing of realword sparse mat if(EIGEN_TEST_MATRIX_DIR) if(NOT WIN32) message(STATUS "Test realworld sparse matrices: ${EIGEN_TEST_MATRIX_DIR}") - add_definitions( -DTEST_REAL_CASES="${EIGEN_TEST_MATRIX_DIR}" ) + add_definitions( -DTEST_REAL_CASES=${EIGEN_TEST_MATRIX_DIR} ) else(NOT WIN32) message(STATUS "REAL CASES CAN NOT BE CURRENTLY TESTED ON WIN32") endif(NOT WIN32) From 999b552c16fe48ba6ca2ea0fe8d8788775f3c58d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 29 May 2018 20:49:25 +0200 Subject: [PATCH 352/680] Search for sequential Pastix. --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 12c12b06d..e1eef086e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -100,7 +100,7 @@ else() endif() -find_package(PASTIX QUIET COMPONENTS METIS SCOTCH) +find_package(PASTIX QUIET COMPONENTS METIS SEQ) # check that the PASTIX found is a version without MPI find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS NAMES pastix_nompi.h From ea94543190354e9cac55784b9e4f17549a14906f Mon Sep 17 00:00:00 2001 From: Katrin Leinweber Date: Thu, 24 May 2018 18:55:40 +0200 Subject: [PATCH 353/680] Hyperlink DOIs against preferred resolver --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 2 +- unsupported/Eigen/src/IterativeSolvers/DGMRES.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index fb6454623..bf5c532ff 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -21,7 +21,7 @@ namespace Eigen { * \brief Fast integer division by a constant. * * See the paper from Granlund and Montgomery for explanation. - * (at http://dx.doi.org/10.1145/773473.178249) + * (at https://doi.org/10.1145/773473.178249) * * \sa Tensor */ diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h index bae04fc30..d603ba336 100644 --- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h @@ -89,7 +89,7 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType:: * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid * Algebraic Solvers for Linear Systems Arising from Compressible * Flows, Computers and Fluids, In Press, - * http://dx.doi.org/10.1016/j.compfluid.2012.03.023 + * https://doi.org/10.1016/j.compfluid.2012.03.023 * [2] K. Burrage and J. 
Erhel, On the performance of various
 * adaptive preconditioned GMRES strategies, 5(1998), 101-121.
 * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES

From 6af1433cb50af7423a1a69afc24c098af9c76bb1 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 29 May 2018 22:37:47 +0200
Subject: [PATCH 354/680] Doc: add aliasing in common pitfalls.

---
 doc/Pitfalls.dox | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/Pitfalls.dox b/doc/Pitfalls.dox
index cf42effef..3f395053d 100644
--- a/doc/Pitfalls.dox
+++ b/doc/Pitfalls.dox
@@ -2,10 +2,16 @@ namespace Eigen {

 /** \page TopicPitfalls Common pitfalls
+
 \section TopicPitfalls_template_keyword Compilation error with template methods

 See this \link TopicTemplateKeyword page \endlink.

+\section TopicPitfalls_aliasing Aliasing
+
+Don't miss this \link TopicAliasing page \endlink on aliasing,
+especially if you got wrong results in statements where the destination appears on the right hand side of the expression.
+
 \section TopicPitfalls_auto_keyword C++11 and the auto keyword

 In short: do not use the auto keywords with Eigen's expressions, unless you are 100% sure about what you are doing. In particular, do not use the auto keyword as a replacement for a Matrix<> type. Here is an example:

From f216854453887f31ac02ffefb7a7a569dc3fa54d Mon Sep 17 00:00:00 2001
From: Michael Figurnov
Date: Thu, 31 May 2018 15:34:53 +0100
Subject: [PATCH 355/680] Exponentially scaled modified Bessel functions of order zero and one.

The functions are conventionally called i0e and i1e. The exponentially scaled version is more numerically stable. The standard Bessel functions can be obtained as i0(x) = exp(|x|) i0e(x).

The code is ported from Cephes and tested against SciPy.
---
 Eigen/src/Core/GenericPacketMath.h | 2 +
 Eigen/src/Core/arch/CUDA/PacketMath.h | 4 +
 .../Eigen/CXX11/src/Tensor/TensorBase.h | 12 +
 unsupported/Eigen/SpecialFunctions | 2 +
 .../SpecialFunctionsArrayAPI.h | 46 ++
 .../SpecialFunctionsFunctors.h | 54 +++
 .../SpecialFunctions/SpecialFunctionsHalf.h | 8 +
 .../SpecialFunctions/SpecialFunctionsImpl.h | 409 ++++++++++++++++++
 .../SpecialFunctionsPacketMath.h | 12 +
 .../arch/CUDA/CudaSpecialFunctions.h | 26 ++
 unsupported/test/cxx11_tensor_cuda.cu | 116 +++++
 unsupported/test/special_functions.cpp | 40 ++
 12 files changed, 731 insertions(+)

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 7979c3aff..888a3f7ea 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -82,6 +82,8 @@ struct default_packet_traits
     HasPolygamma = 0,
     HasErf = 0,
     HasErfc = 0,
+    HasI0e = 0,
+    HasI1e = 0,
     HasIGamma = 0,
     HasIGammac = 0,
     HasBetaInc = 0,
diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h
index 97a8abe59..704a4e0d9 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -44,6 +44,8 @@ template<> struct packet_traits : default_packet_traits
     HasPolygamma = 1,
     HasErf = 1,
     HasErfc = 1,
+    HasI0e = 1,
+    HasI1e = 1,
     HasIGamma = 1,
     HasIGammac = 1,
     HasBetaInc = 1,
@@ -73,6 +75,8 @@ template<> struct packet_traits : default_packet_traits
     HasPolygamma = 1,
     HasErf = 1,
     HasErfc = 1,
+    HasI0e = 1,
+    HasI1e = 1,
     HasIGamma = 1,
     HasIGammac = 1,
     HasBetaInc = 1,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 745b16d2c..a942c98dd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -133,6
+133,18 @@ class TensorBase return unaryExpr(internal::scalar_digamma_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + i0e() const { + return unaryExpr(internal::scalar_i0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + i1e() const { + return unaryExpr(internal::scalar_i1e_op()); + } + // igamma(a = this, x = other) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions index a2ad4925e..482ec6e6f 100644 --- a/unsupported/Eigen/SpecialFunctions +++ b/unsupported/Eigen/SpecialFunctions @@ -34,6 +34,8 @@ namespace Eigen { * - polygamma * - zeta * - betainc + * - i0e + * - i1e * * \code * #include diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h index ed415db99..b7a9d035b 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h @@ -119,6 +119,52 @@ zeta(const Eigen::ArrayBase& x, const Eigen::ArrayBase& q) ); } +/** \returns an expression of the coefficient-wise i0e(\a x) to the given + * arrays. + * + * It returns the exponentially scaled modified Bessel + * function of order zero. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of i0e(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::i0e() + */ +template +inline const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_i0e_op, const Derived> +i0e(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_i0e_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise i1e(\a x) to the given + * arrays. + * + * It returns the exponentially scaled modified Bessel + * function of order one. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of i1e(T) for + * any scalar type T to be supported. 
+ *
+ * \sa ArrayBase::i1e()
+ */
+template
+inline const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_i1e_op, const Derived>
+i1e(const Eigen::ArrayBase& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_i1e_op,
+      const Derived>(x.derived());
+}
+
 } // end namespace Eigen

 #endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
index d8f2363be..8420f0174 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -229,6 +229,60 @@ struct functor_traits >
   };
 };

+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of order zero
+ * \sa class CwiseUnaryOp, Cwise::i0e()
+ */
+template
+struct scalar_i0e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_i0e_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x) const {
+    using numext::i0e;
+    return i0e(x);
+  }
+  typedef typename packet_traits::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+    return internal::pi0e(x);
+  }
+};
+template
+struct functor_traits > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits::MulCost + 40 * NumTraits::AddCost,
+    PacketAccess = packet_traits::HasI0e
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of order one
+ * \sa class CwiseUnaryOp, Cwise::i1e()
+ */
+template
+struct scalar_i1e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_i1e_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x) const {
+    using numext::i1e;
+    return i1e(x);
+  }
+  typedef typename packet_traits::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+    return internal::pi1e(x);
+  }
+};
+template
+struct functor_traits > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+ Cost = 20 * NumTraits::MulCost + 40 * NumTraits::AddCost, + PacketAccess = packet_traits::HasI1e + }; +}; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h index 553bcda6a..c5867002e 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h @@ -39,6 +39,14 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) { return Eigen::half(Eigen::numext::betainc(static_cast(a), static_cast(b), static_cast(x))); } +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half i0e(const Eigen::half& x) { + return Eigen::half(Eigen::numext::i0e(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half i1e(const Eigen::half& x) { + return Eigen::half(Eigen::numext::i1e(static_cast(x))); +} #endif } // end namespace numext diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 97b2e5a91..6c7ac3f3b 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -95,6 +95,75 @@ struct polevl { } }; +/* chbevl (modified for Eigen) + * + * Evaluate Chebyshev series + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N], chebevl(); + * + * y = chbevl( x, coef, N ); + * + * + * + * DESCRIPTION: + * + * Evaluates the series + * + * N-1 + * - ' + * y = > coef[i] T (x/2) + * - i + * i=0 + * + * of Chebyshev polynomials Ti at argument x/2. + * + * Coefficients are stored in reverse order, i.e. the zero + * order term is last in the array. Note N is the number of + * coefficients, not the order. + * + * If coefficients are for the interval a to b, x must + * have been transformed to x -> 2(2x - b - a)/(b-a) before + * entering the routine. This maps x from (a, b) to (-1, 1), + * over which the Chebyshev polynomials are defined. + * + * If the coefficients are for the inverted interval, in + * which (a, b) is mapped to (1/b, 1/a), the transformation + * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, + * this becomes x -> 4a/x - 1. + * + * + * + * SPEED: + * + * Taking advantage of the recurrence properties of the + * Chebyshev polynomials, the routine requires one more + * addition per loop than evaluating a nested polynomial of + * the same degree. 
+ * + */ +template +struct chebevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar x, const Scalar coef[]) { + Scalar b0 = coef[0]; + Scalar b1 = 0; + Scalar b2; + + for (int i = 1; i < N; i++) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + coef[i]; + } + + return Scalar(0.5) * (b0 - b2); + } +}; + } // end namespace cephes /**************************************************************************** @@ -1505,6 +1574,334 @@ struct betainc_impl { } }; +/**************************************************************************** + * Implementation of Bessel function, based on Cephes * + ****************************************************************************/ + +template +struct i0e_retval { + typedef Scalar type; +}; + +template +struct i0e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <> +struct i0e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { + /* i0ef.c + * + * Modified Bessel function of order zero, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * float x, y, i0ef(); + * + * y = i0ef( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of order zero of the argument. + * + * The function is defined as i0e(x) = exp(-|x|) j0( ix ). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 100000 3.7e-7 7.0e-8 + * See i0f(). + * + */ + const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f, + -2.67079385394061173391E-7f, 1.11738753912010371815E-6f, + -4.41673835845875056359E-6f, 1.64484480707288970893E-5f, + -5.75419501008210370398E-5f, 1.88502885095841655729E-4f, + -5.76375574538582365885E-4f, 1.63947561694133579842E-3f, + -4.32430999505057594430E-3f, 1.05464603945949983183E-2f, + -2.37374148058994688156E-2f, 4.93052842396707084878E-2f, + -9.49010970480476444210E-2f, 1.71620901522208775349E-1f, + -3.04682672343198398683E-1f, 6.76795274409476084995E-1f}; + + const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f, + 2.04891858946906374183E-7f, 2.89137052083475648297E-6f, + 6.88975834691682398426E-5f, 3.36911647825569408990E-3f, + 8.04490411014108831608E-1f}; + if (x < 0.0f) { + x = -x; + } + + if (x <= 8.0f) { + float y = 0.5f * x - 2.0f; + return cephes::chebevl::run(y, A); + } + + return cephes::chebevl::run(32.0f / x - 2.0f, B) / numext::sqrt(x); + } +}; + +template <> +struct i0e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { + /* i0e.c + * + * Modified Bessel function of order zero, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * double x, y, i0e(); + * + * y = i0e( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of order zero of the argument. + * + * The function is defined as i0e(x) = exp(-|x|) j0( ix ). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 5.4e-16 1.2e-16 + * See i0(). 
+ * + */ + const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + const double B[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + if (x < 0.0) { + x = -x; + } + + if (x <= 8.0) { + double y = (x / 2.0) - 2.0; + return cephes::chebevl::run(y, A); + } + + return cephes::chebevl::run(32.0 / x - 2.0, B) / + numext::sqrt(x); + } +}; + +template +struct i1e_retval { + typedef Scalar type; +}; + +template +struct i1e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <> +struct i1e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { + /* i1ef.c + * + * Modified Bessel function of order one, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * float x, y, i1ef(); + * + * y = i1ef( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of order one of the argument. + * + * The function is defined as i1(x) = -i exp(-|x|) j1( ix ). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 1.5e-6 1.5e-7 + * See i1(). 
+ * + */ + const float A[] = {9.38153738649577178388E-9f, -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + + const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + float z = numext::abs(x); + + if (z <= 8.0f) { + float y = 0.5f * z - 2.0f; + z = cephes::chebevl::run(y, A) * z; + } else { + z = cephes::chebevl::run(32.0f / z - 2.0f, B) / numext::sqrt(z); + } + + if (x < 0.0f) { + z = -z; + } + + return z; + } +}; + +template <> +struct i1e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { + /* i1e.c + * + * Modified Bessel function of order one, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * double x, y, i1e(); + * + * y = i1e( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of order one of the argument. + * + * The function is defined as i1(x) = -i exp(-|x|) j1( ix ). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 2.0e-15 2.0e-16 + * See i1(). + * + */ + const double A[] = {2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const double B[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + double z = numext::abs(x); + + if (z <= 8.0) { + double y = (z / 2.0) - 2.0; + z = cephes::chebevl::run(y, A) * z; + } else { + z = cephes::chebevl::run(32.0 / z - 2.0, B) / numext::sqrt(z); + } + + if (x < 0.0) { + z = -z; + } + + return z; + } +}; + #endif // EIGEN_HAS_C99_MATH } // end namespace internal 
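As a sanity check on the implementations above: the identity from the commit message lets the unscaled functions be recovered from the scaled ones. A minimal host-side sketch (illustrative only; the bessel_i0/bessel_i1 names are ours, and it uses the numext wrappers introduced in the next hunk):

#include <cmath>
#include <unsupported/Eigen/SpecialFunctions>

// I0(x) = exp(|x|) * i0e(x) and I1(x) = exp(|x|) * i1e(x).
// exp(|x|) overflows quickly for large |x|, which is exactly why the
// exponentially scaled variants are the numerically stable ones to expose.
double bessel_i0(double x) { return std::exp(std::abs(x)) * Eigen::numext::i0e(x); }
double bessel_i1(double x) { return std::exp(std::abs(x)) * Eigen::numext::i1e(x); }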
@@ -1565,6 +1962,18 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar) return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x); } +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(i0e, Scalar) + i0e(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(i0e, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(i1e, Scalar) + i1e(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(i1e, Scalar)::run(x); +} + } // end namespace numext diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h index 46d60d323..4c176716b 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h @@ -50,6 +50,18 @@ Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; retur template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); } +/** \internal \returns the exponentially scaled modified Bessel function of + * order zero i0e(\a a) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pi0e(const Packet& x) { using numext::i0e; return i0e(x); } + +/** \internal \returns the exponentially scaled modified Bessel function of + * order one i1e(\a a) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pi1e(const Packet& x) { using numext::i1e; return i1e(x); } + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h index e0e3a8be6..c25fea0b3 100644 --- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h +++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h @@ -156,6 +156,32 @@ double2 pbetainc(const double2& a, const double2& b, const double2& x) return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y)); } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pi0e(const float4& x) { + using numext::i0e; + return make_float4(i0e(x.x), i0e(x.y), i0e(x.z), i0e(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pi0e(const double2& x) { + using numext::i0e; + return make_double2(i0e(x.x), i0e(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pi1e(const float4& x) { + using numext::i1e; + return make_float4(i1e(x.x), i1e(x.y), i1e(x.z), i1e(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pi1e(const double2& x) { + using numext::i1e; + return make_double2(i1e(x.x), i1e(x.y)); +} + #endif } // end namespace internal diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 9584a539f..63d0a345a 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -1208,6 +1208,116 @@ void test_cuda_betainc() cudaFree(d_out); } +template +void test_cuda_i0e() +{ + Tensor in_x(21); + Tensor out(21); + Tensor expected_out(21); + out.setZero(); + + Array in_x_array(21); + Array expected_out_array(21); + + in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, + -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361, + 
0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857, + 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554, + 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163, + 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128, + 0.0897803118848; + + for (int i = 0; i < 21; ++i) { + in_x(i) = in_x_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in_x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 21); + Eigen::TensorMap > gpu_out(d_out, 21); + + gpu_out.device(gpu_device) = gpu_in.i0e(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 21; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + cudaFree(d_in); + cudaFree(d_out); +} + +template +void test_cuda_i1e() +{ + Tensor in_x(21); + Tensor out(21); + Tensor expected_out(21); + out.setZero(); + + Array in_x_array(21); + Array expected_out_array(21); + + in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, + -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565, + -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293, + -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249, + 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384, + 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872, + 0.0875062221833; + + for (int i = 0; i < 21; ++i) { + in_x(i) = in_x_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in_x.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 21); + Eigen::TensorMap > gpu_out(d_out, 21); + + gpu_out.device(gpu_device) = gpu_in.i1e(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 21; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + cudaFree(d_in); + cudaFree(d_out); +} + void test_cxx11_tensor_cuda() { @@ -1280,5 +1390,11 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_6(test_cuda_betainc()); CALL_SUBTEST_6(test_cuda_betainc()); + + CALL_SUBTEST_6(test_cuda_i0e()); + CALL_SUBTEST_6(test_cuda_i0e()); + + CALL_SUBTEST_6(test_cuda_i1e()); + CALL_SUBTEST_6(test_cuda_i1e()); #endif } diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp index 057fb3e92..48d0db95e 100644 --- a/unsupported/test/special_functions.cpp +++ b/unsupported/test/special_functions.cpp @@ -335,6 +335,46 @@ template void array_special_functions() ArrayType test = betainc(a, b + one, x) + eps; verify_component_wise(test, expected);); } + + // Test Bessel function i0e. Reference results obtained with SciPy. 
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << 0.0897803118848, 0.0947062952128, 0.100544127361,
+        0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+        0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+        0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+        0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+        0.0897803118848;
+
+    CALL_SUBTEST(res = i0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i1e. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << -0.0875062221833, -0.092036796872, -0.0973496147565,
+        -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+        -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+        0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+        0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+        0.0875062221833;
+
+    CALL_SUBTEST(res = i1e(x);
+                 verify_component_wise(res, expected););
+  }
 #endif
 }

From 84868da904ac8d07342983b9bb78cf3360142363 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 31 May 2018 21:21:57 +0200
Subject: [PATCH 356/680] Don't run hg on non-mercurial clone

---
 CMakeLists.txt | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5245b2930..91545bdb0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,10 +41,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
 set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
 set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})

-# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
-# but won't stop CMake.
-execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
-execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+# if we are not in a mercurial clone
+if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
+  # if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
+  # but won't stop CMake.
+  execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
+  execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+endif()

 # if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
 if(EIGEN_BRANCH_OUTPUT MATCHES "default")

From e2ed0cf8abfe275bd147f3228e4c59f5242993f6 Mon Sep 17 00:00:00 2001
From: Penporn Koanantakool
Date: Sat, 2 Jun 2018 12:07:49 -0700
Subject: [PATCH 357/680] Add a ThreadPoolInterface* getter for ThreadPoolDevice.
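A minimal usage sketch of the new accessor (not part of the patch; the pool size and the task body are made up for illustration):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                 // arbitrary thread count
  Eigen::ThreadPoolDevice device(&pool, 4);
  // getPool() exposes the ThreadPoolInterface the device was built with,
  // so unrelated work can share the same threads as tensor expressions.
  device.getPool()->Schedule([] { /* non-tensor task */ });
  return 0;  // a real program would synchronize before tearing down the pool
}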
--- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index ec6802e85..ca9ba402e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -169,7 +169,7 @@ struct ThreadPoolDevice {

   // parallelFor executes f with [0, n) arguments in parallel and waits for
   // completion. F accepts a half-open interval [first, last).
-  // Block size is choosen based on the iteration cost and resulting parallel
+  // Block size is chosen based on the iteration cost and resulting parallel
   // efficiency. If block_align is not nullptr, it is called to round up the
   // block size.
   void parallelFor(Index n, const TensorOpCost& cost,
@@ -261,6 +261,9 @@ struct ThreadPoolDevice {
     parallelFor(n, cost, nullptr, std::move(f));
   }

+  // Thread pool accessor.
+  ThreadPoolInterface* getPool() const { return pool_; }
+
  private:
   ThreadPoolInterface* pool_;
   int num_threads_;

From 8fbd47052bcafea612b8ae2841c1de5db738f042 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 6 Jun 2018 10:12:58 -0400
Subject: [PATCH 358/680] Adding support for using Eigen in HIP kernels.

This commit enables the use of Eigen in HIP kernels / AMD GPUs. Support has been added along the same lines as what already exists for using Eigen in CUDA kernels / NVidia GPUs.

Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers get picked up by default during an Eigen compile (irrespective of whether or not the underlying compiler is CUDACC/NVCC), e.g. Eigen/src/Core/arch/CUDA/Half.h. In order to maintain this behavior, the EIGEN_USE_HIP macro is used to switch to the HIP version of those header files (see Eigen/Core and unsupported/Eigen/CXX11/Tensor).

Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP-specific unit tests.
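For illustration, the intended application-side workflow looks roughly like this (a sketch, not part of the patch; the kernel name and launch indexing are made up, and a working hipcc toolchain is assumed):

// Compiled with hipcc. EIGEN_USE_HIP must be defined before any Eigen include
// so that the HIP variants of Half.h etc. are selected.
#define EIGEN_USE_HIP
#include <hip/hip_runtime.h>
#include <Eigen/Core>

__global__ void scale_by_two(Eigen::half* data, int n) {
  int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
  if (i < n) data[i] = data[i] * Eigen::half(2.0f);  // device-side half arithmetic
}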
--- Eigen/Core | 69 +- Eigen/src/Core/GenericPacketMath.h | 8 +- Eigen/src/Core/MathFunctions.h | 61 +- Eigen/src/Core/ProductEvaluators.h | 6 + Eigen/src/Core/arch/HIP/hcc/Half.h | 705 ++++++++ Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h | 1019 +++++++++++ Eigen/src/Core/arch/HIP/hcc/TypeCasting.h | 212 +++ Eigen/src/Core/arch/HIP/hcc/math_constants.h | 23 + Eigen/src/Core/functors/BinaryFunctors.h | 6 + Eigen/src/Core/util/BlasUtil.h | 5 +- Eigen/src/Core/util/Macros.h | 5 +- Eigen/src/Core/util/Memory.h | 38 + Eigen/src/Core/util/Meta.h | 52 +- .../src/Eigenvalues/SelfAdjointEigenSolver.h | 1 + Eigen/src/SVD/BDCSVD.h | 2 +- cmake/EigenTesting.cmake | 9 +- test/CMakeLists.txt | 42 + test/hip_basic.cu | 172 ++ test/hip_common.h | 103 ++ test/main.h | 22 +- unsupported/Eigen/CXX11/Tensor | 40 +- .../CXX11/src/Tensor/TensorContraction.h | 10 +- .../src/Tensor/TensorContractionBlocking.h | 5 +- .../CXX11/src/Tensor/TensorContractionHip.h | 1521 +++++++++++++++++ .../CXX11/src/Tensor/TensorConvolutionHip.h | 1119 ++++++++++++ .../CXX11/src/Tensor/TensorDeviceDefault.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorDeviceHip.h | 352 ++++ .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorMacros.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 24 +- .../CXX11/src/Tensor/TensorReductionHip.h | 815 +++++++++ .../Eigen/CXX11/src/Tensor/TensorScan.h | 9 +- unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 12 + .../Eigen/CXX11/src/util/EmulateArray.h | 2 +- .../SpecialFunctions/SpecialFunctionsImpl.h | 4 +- unsupported/test/CMakeLists.txt | 52 + unsupported/test/cxx11_tensor_argmax_hip.cu | 251 +++ .../test/cxx11_tensor_cast_float16_hip.cu | 79 + unsupported/test/cxx11_tensor_contract_hip.cu | 215 +++ unsupported/test/cxx11_tensor_device_hip.cu | 389 +++++ unsupported/test/cxx11_tensor_hip.cu | 1296 ++++++++++++++ .../test/cxx11_tensor_of_float16_hip.cu | 498 ++++++ unsupported/test/cxx11_tensor_random_hip.cu | 85 + .../test/cxx11_tensor_reduction_hip.cu | 154 ++ unsupported/test/cxx11_tensor_scan_hip.cu | 76 + 50 files changed, 9527 insertions(+), 94 deletions(-) create mode 100644 Eigen/src/Core/arch/HIP/hcc/Half.h create mode 100644 Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h create mode 100644 Eigen/src/Core/arch/HIP/hcc/TypeCasting.h create mode 100644 Eigen/src/Core/arch/HIP/hcc/math_constants.h create mode 100644 test/hip_basic.cu create mode 100644 test/hip_common.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h create mode 100644 unsupported/test/cxx11_tensor_argmax_hip.cu create mode 100644 unsupported/test/cxx11_tensor_cast_float16_hip.cu create mode 100644 unsupported/test/cxx11_tensor_contract_hip.cu create mode 100644 unsupported/test/cxx11_tensor_device_hip.cu create mode 100644 unsupported/test/cxx11_tensor_hip.cu create mode 100644 unsupported/test/cxx11_tensor_of_float16_hip.cu create mode 100644 unsupported/test/cxx11_tensor_random_hip.cu create mode 100644 unsupported/test/cxx11_tensor_reduction_hip.cu create mode 100644 
unsupported/test/cxx11_tensor_scan_hip.cu diff --git a/Eigen/Core b/Eigen/Core index f6bc18a08..c72d5468a 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -22,6 +22,17 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif +#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) + // analogous to EIGEN_CUDACC, but for HIP + #define EIGEN_HIPCC __HIPCC__ +#endif + +// NVCC is not supported as the target platform for HIPCC +// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive +#if defined(__NVCC__) && defined(__HIPCC__) + #error "NVCC as the target platform for HIPCC is currently not supported." +#endif + // Starting with CUDA 9 the composite __CUDACC_VER__ is not available. #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) @@ -32,8 +43,8 @@ #endif // Handle NVCC/CUDA/SYCL -#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) - // Do not try asserts on CUDA and SYCL! +#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC) + // Do not try asserts on CUDA, HIP and SYCL! #ifndef EIGEN_NO_DEBUG #define EIGEN_NO_DEBUG #endif @@ -57,6 +68,26 @@ // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro // works properly on the device side #include + + #elif defined(EIGEN_HIPCC) + // Do not try to vectorize on HIP + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif + + #define EIGEN_DEVICE_FUNC __host__ __device__ + // We need hip_runtime.h to ensure that that EIGEN_USING_STD_MATH macro + // works properly on the device side + #include + + #if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP) + // analogous to EIGEN_CUDA_ARCH, but for HIP + #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ + // Note this check needs to come after we include hip_runtime.h since + // hip_runtime.h includes hip_common.h which in turn has the define + // for __HIP_DEVICE_COMPILE__ + #endif + #else #define EIGEN_DEVICE_FUNC #endif @@ -68,16 +99,16 @@ #define EIGEN_DONT_VECTORIZE #endif -// When compiling CUDA device code with NVCC, pull in math functions from the -// global namespace. In host mode, and when device doee with clang, use the -// std versions. -#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__) +// When compiling CUDA device code with NVCC, or HIP device code with HIPCC +// pull in math functions from the global namespace. In host mode, and when +// device doee with clang, use the std versions. 
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIPCC__)) #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; #else #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; #endif -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE) #define EIGEN_EXCEPTIONS #endif @@ -270,6 +301,17 @@ #include #endif +#if defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_HAS_HIP_FP16 + #include + #define HIP_PATCH_WITH_NEW_FP16 18215 + #if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16) + #define EIGEN_HAS_OLD_HIP_FP16 + // Old HIP implementation does not have a explicit typedef for "half2" + typedef __half2 half2; + #endif +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -390,7 +432,6 @@ using std::ptrdiff_t; #include "src/Core/util/IntegralConstant.h" #include "src/Core/util/SymbolicIndex.h" - #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" @@ -434,9 +475,15 @@ using std::ptrdiff_t; #endif // Half float support -#include "src/Core/arch/CUDA/Half.h" -#include "src/Core/arch/CUDA/PacketMathHalf.h" -#include "src/Core/arch/CUDA/TypeCasting.h" +#if defined EIGEN_USE_HIP + #include "src/Core/arch/HIP/hcc/Half.h" + #include "src/Core/arch/HIP/hcc/PacketMathHalf.h" + #include "src/Core/arch/HIP/hcc/TypeCasting.h" +#else + #include "src/Core/arch/CUDA/Half.h" + #include "src/Core/arch/CUDA/PacketMathHalf.h" + #include "src/Core/arch/CUDA/TypeCasting.h" +#endif #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 888a3f7ea..0903c3a6e 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -299,7 +299,11 @@ template EIGEN_DEVICE_FUNC inline void pstoreu { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ -template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) +template + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + inline void prefetch(const Scalar* addr) { #ifdef EIGEN_CUDA_ARCH #if defined(__LP64__) @@ -528,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes. 
-#ifndef EIGEN_CUDACC +#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 05462c5e1..6beef5def 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template struct real_impl > { @@ -144,7 +144,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template struct imag_impl > { @@ -260,7 +260,7 @@ struct conj_default_impl template struct conj_impl : conj_default_impl {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template struct conj_impl > { @@ -435,7 +435,12 @@ struct round_retval struct arg_impl { static inline Scalar run(const Scalar& x) { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // HIP does not seem to have a native device side implementation for the math routine "arg" + using std::arg; + #else EIGEN_USING_STD_MATH(arg); + #endif return arg(x); } }; @@ -768,7 +773,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isfinite(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -783,7 +790,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isinf(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -798,7 +807,9 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_HIP_DEVICE_COMPILE) + return isnan(x); + #elif defined(EIGEN_CUDA_ARCH) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -864,7 +875,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) && !defined(__SYCL_DEVICE_ONLY__) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -1078,7 +1089,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1136,7 +1147,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } 
@@ -1157,7 +1168,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1215,7 +1226,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1243,7 +1254,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1273,7 +1284,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } @@ -1309,7 +1320,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1329,7 +1340,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1349,7 +1360,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1369,7 +1380,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1400,7 +1411,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1431,7 +1442,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || 
defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1462,7 +1473,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); } EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1483,7 +1494,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1503,7 +1514,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1521,12 +1532,12 @@ T tanh(const T &x) { #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH +#elif (!defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1546,7 +1557,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y) EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index fb637191d..cc75fbce3 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -137,6 +137,9 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { @@ -390,6 +393,9 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; template + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); diff --git a/Eigen/src/Core/arch/HIP/hcc/Half.h b/Eigen/src/Core/arch/HIP/hcc/Half.h new file mode 100644 index 000000000..2ce8a412c --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/Half.h @@ -0,0 +1,705 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting from HIP's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the like), but fast on GPUs.
+
+
+#ifndef EIGEN_HALF_HIP_H
+#define EIGEN_HALF_HIP_H
+
+#if __cplusplus > 199711L
+#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
+#else
+#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
+#endif
+
+
+namespace Eigen {
+
+struct half;
+
+namespace half_impl {
+
+#if !defined(EIGEN_HAS_HIP_FP16)
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+  unsigned short x;
+};
+#elif defined(EIGEN_HAS_OLD_HIP_FP16)
+// Make a __half_raw definition that is
+// ++ compatible with that of Eigen and
+// ++ adds an implicit conversion to the native __half of the old HIP implementation.
+//
+// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementations.
+//
+// In the old HIP implementation,
+// ++ __half is a typedef of __fp16
+// ++ the "__h*" routines take "__half" arguments
+// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
+// that conversion in each call to a "__h*" routine; that is why we have the "operator __half" routine.
+struct __half_raw {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+  union {
+    unsigned short x;
+    __half data;
+  };
+  operator __half(void) const { return data; }
+};
+#endif
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
+
+struct half_base : public __half_raw {
+  EIGEN_DEVICE_FUNC half_base() {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+#if defined(EIGEN_HAS_HIP_FP16)
+  #if defined(EIGEN_HAS_OLD_HIP_FP16)
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
+  #else
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+  #endif
+#endif
+};
+
+} // namespace half_impl
+
+// Class definition.
+struct half : public half_impl::half_base {
+  #if !defined(EIGEN_HAS_HIP_FP16) || defined(EIGEN_HAS_OLD_HIP_FP16)
+  typedef half_impl::__half_raw __half_raw;
+  #endif
+
+  EIGEN_DEVICE_FUNC half() {}
+
+  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
+
+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  template
+  explicit EIGEN_DEVICE_FUNC half(const T& val)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(val))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+    // +0.0 and -0.0 become false, everything else becomes true.
+    return (x & 0x7fff) != 0;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+    return static_cast<signed char>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+    return static_cast<unsigned char>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+    return static_cast<short>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+    return static_cast<unsigned short>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+    return static_cast<int>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+    return static_cast<unsigned int>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+    return static_cast<long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+    return static_cast<unsigned long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+    return static_cast<long long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+    return static_cast<unsigned long long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+    return half_impl::half_to_float(*this);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
+    return static_cast<double>(half_impl::half_to_float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
+    x = other.x;
+    return *this;
+  }
+};
+
+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 11;
+  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int radix = 2;
+  static const int min_exponent = -13;
+  static const int min_exponent10 = -4;
+  static const int max_exponent = 16;
+  static const int max_exponent10 = 4;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+
+  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+  static Eigen::half round_error() { return Eigen::half(0.5); }
+  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, we should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
+
+namespace Eigen {
+
+namespace half_impl {
+
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get a speedup from the ALU), but you do save the
+// conversion steps back and forth.
+
+EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+  return __hadd(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
+  return __hmul(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
+  return __hsub(a, b);
+}
+EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+}
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
+  return __hneg(a);
+}
+EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
+  return __heq(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
+  return __hne(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
+  return __hlt(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
+  return __hle(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
+  return __hgt(a, b);
+}
+EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
+  return __hge(a, b);
+}
+
+#else  // Emulate support for half floats
+
+// Definitions for CPUs, mostly implemented via conversion to/from fp32.
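+//
+// Illustrative sketch (not part of the patch): every emulated operator in
+// this block follows the same widen/compute/narrow pattern, e.g.
+//
+//   half emulated_add(const half& a, const half& b) {
+//     float fa = float(a);          // widen via half_to_float
+//     float fb = float(b);
+//     return half(fa + fb);         // narrow via float_to_half_rtne
+//   }
+//
+// so each operation rounds exactly once, on the final conversion back to fp16.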
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
+  return half(float(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
+  return half(float(a) * float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
+  return half(float(a) - float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
+  return half(float(a) / float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+  half result;
+  result.x = a.x ^ 0x8000;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+  a = half(float(a) + float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+  a = half(float(a) * float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+  a = half(float(a) - float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+  a = half(float(a) / float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
+  return numext::equal_strict(float(a),float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+  return numext::not_equal_strict(float(a), float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
+  return float(a) < float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
+  return float(a) <= float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
+  return float(a) > float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
+  return float(a) >= float(b);
+}
+
+#endif  // Emulate support for half floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+  return half(static_cast<float>(a) / static_cast<float>(b));
+}
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, it
+// would also be possible to vectorize them directly.
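+//
+// Minimal round-trip sketch (illustrative only): values exactly representable
+// in fp16 survive the conversions below unchanged, e.g.
+//
+//   __half_raw h = float_to_half_rtne(1.5f);  // 1.5f is exact in fp16
+//   float f = half_to_float(h);               // f == 1.5f again
+//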
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+  __half_raw h;
+  h.x = x;
+  return h;
+}
+
+union FP32 {
+  unsigned int u;
+  float f;
+};
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  __half tmp_ff = __float2half(ff);
+  #if defined(EIGEN_HAS_OLD_HIP_FP16)
+  __half_raw h;
+  h.data = tmp_ff;
+  return h;
+  #else
+  return *(__half_raw*)&tmp_ff;
+  #endif
+
+#elif defined(EIGEN_HAS_FP16_C)
+  __half_raw h;
+  h.x = _cvtss_sh(ff, 0);
+  return h;
+
+#else
+  FP32 f; f.f = ff;
+
+  const FP32 f32infty = { 255 << 23 };
+  const FP32 f16max = { (127 + 16) << 23 };
+  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  unsigned int sign_mask = 0x80000000u;
+  __half_raw o;
+  o.x = static_cast<unsigned short>(0x0u);
+
+  unsigned int sign = f.u & sign_mask;
+  f.u ^= sign;
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
+    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+  } else {  // (De)normalized number or zero
+    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f.f += denorm_magic.f;
+
+      // and one integer subtract of the bias later, we have our final float!
+      o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
+    } else {
+      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
+      // rounding bias part 2
+      f.u += mant_odd;
+      // take the bits!
+      o.x = static_cast<unsigned short>(f.u >> 13);
+    }
+  }
+
+  o.x |= static_cast<unsigned short>(sign >> 16);
+  return o;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return __half2float(h);
+
+#elif defined(EIGEN_HAS_FP16_C)
+  return _cvtsh_ss(h.x);
+
+#else
+  const FP32 magic = { 113 << 23 };
+  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+  FP32 o;
+
+  o.u = (h.x & 0x7fff) << 13;            // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.u;  // just the exponent
+  o.u += (127 - 15) << 23;               // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {      // Inf/NaN?
+    o.u += (128 - 16) << 23;     // extra exp adjust
+  } else if (exp == 0) {         // Zero/Denormal?
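+    // (The two lines below renormalize a subnormal with float arithmetic
+    // instead of a bit-scan loop: bump the exponent by one, then subtract
+    // the magic constant to cancel the implicit leading-one bit.)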
+    o.u += 1 << 23;              // extra exp adjust
+    o.f -= magic.f;              // renormalize
+  }
+
+  o.u |= (h.x & 0x8000) << 16;   // sign bit
+  return o.f;
+#endif
+}
+
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
+  return (a.x & 0x7fff) == 0x7c00;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return __hisnan(a);
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
+  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
+  half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hexp(a));
+#else
+  return half(::expf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
+  return half(numext::expm1(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hlog(a));
+#else
+  return half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
+  return half(numext::log1p(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
+  return half(::log10f(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hsqrt(a));
+#else
+  return half(::sqrtf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
+  return half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+  return half(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+  return half(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+  return half(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+  return half(::tanhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hfloor(a));
+#else
+  return half(::floorf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hceil(a));
+#else
+  return half(::ceilf(float(a)));
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return __hlt(b, a) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return __hlt(a, b) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+#endif
+}
+
+EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+
+} // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template<>
+struct random_default_impl<half, false, false>
+{
+  static inline half run(const half& x, const half& y)
+  {
+    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline half run()
+  {
+    return run(half(-1.f), half(1.f));
+  }
+};
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+template<> struct NumTraits<Eigen::half>
+    : GenericNumTraits<Eigen::half>
+{
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7c01);
+  }
+};
+
+} // end namespace Eigen
+
+// C-like standard mathematical functions and transcendentals.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
+  Eigen::half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
+  return Eigen::half(::expf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+  return Eigen::half(hlog(a));
+#else
+  return Eigen::half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
+  return Eigen::half(::sqrtf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
+  return Eigen::half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
+  return Eigen::half(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
+  return Eigen::half(::ceilf(float(a)));
+}
+
+namespace std {
+
+#if __cplusplus > 199711L
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
+    return static_cast<std::size_t>(a.x);
+  }
+};
+#endif
+
+} // end namespace std
+
+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_HAS_HIP_FP16) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  // FIXME
+  //return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+  return var;
+}
+#endif
+
+// ldg() has an overload for __half, but we also need one for Eigen::half.
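+// (The fallback below just dereferences the pointer, which is functionally
+// equivalent minus the read-only-cache hint.)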
+#if defined(EIGEN_HAS_HIP_FP16) && \
+    defined(__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__) && defined(__HIP_ARCH_HAS_DYNAMIC_PARALLEL__)
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
+  // FIXME
+  //return Eigen::half_impl::raw_uint16_to_half(
+  //    __ldg(reinterpret_cast<const unsigned short*>(ptr)));
+  return *ptr;
+}
+#endif
+
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+namespace Eigen {
+namespace numext {
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isnan)(const Eigen::half& h) {
+  return (half_impl::isnan)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isinf)(const Eigen::half& h) {
+  return (half_impl::isinf)(h);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool (isfinite)(const Eigen::half& h) {
+  return (half_impl::isfinite)(h);
+}
+
+} // namespace numext
+} // namespace Eigen
+#endif
+
+#endif // EIGEN_HALF_HIP_H
diff --git a/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h b/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h
new file mode 100644
index 000000000..29c3f4671
--- /dev/null
+++ b/Eigen/src/Core/arch/HIP/hcc/PacketMathHalf.h
@@ -0,0 +1,1019 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_HALF_HIP_H
+#define EIGEN_PACKET_MATH_HALF_HIP_H
+
+
+namespace Eigen {
+namespace internal {
+
+// Most of the following operations require arch >= 3.0
+#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
+
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<Eigen::half> : default_packet_traits
+{
+  typedef half2 type;
+  typedef half2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasExpm1  = 1,
+    HasLog    = 1,
+    HasLog1p  = 1
+  };
+};
+
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+  return half2half2(from);
+#else
+  return __half2half2(from);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+  return __halves2half2(from[0], from[1]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
+  return __halves2half2(from[0], from[0]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
+
+template<>
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+  return __halves2half2((*(from+0)), (*(from+1)));
+#else
+  return __ldg((const half2*)from);
+#endif
+}
+
+template<>
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+  return __halves2half2((*(from+0)), (*(from+1)));
+#else
+  return __halves2half2(__ldg(from+0),
__ldg(from+1)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { + return __low2half(a); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { + __half x = __ushort_as_half(__half_as_ushort(__low2half(a)) & 0x7FFF); + __half y = __ushort_as_half(__half_as_ushort(__high2half(a)) & 0x7FFF); + return __halves2half2(x, y); +} + + +__device__ EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { + return __hadd2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { + return __hsub2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { + return __hneg2(a); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { + return __hmul2(a, b); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { + return __hfma2(a, b, c); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return h2div(a, b); +#else + return __h2div(a, b); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { + return __hadd(__low2half(a), __high2half(a)); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? 
first : second; +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { + return __hmul(__low2half(a), __high2half(a)); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expm1f(a1); + float r2 = expm1f(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 plog(const half2& a) { + return h2log(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 pexp(const half2& a) { + return h2exp(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 psqrt(const half2& a) { + return h2sqrt(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 prsqrt(const half2& a) { + return h2rsqrt(a); +} + +#elif defined EIGEN_VECTORIZE_AVX512 + +typedef struct { + __m256i x; +} Packet16h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. + typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; + +template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { + Packet16h result; + result.x = _mm256_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_load_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_loadu_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { + _mm256_store_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { + _mm256_storeu_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a.x); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); 
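+  // Note: _mm512_set_ps lists lanes from highest to lowest, so passing
+  // ff..f0 keeps aux[0] in lane 0 of the reassembled register.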
+ + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet16h result; + result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + Packet16h result; + result.x = _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) +{ + Packet16h result; + result.x = _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; + to[stride*8].x = aux[8].x; + to[stride*9].x = aux[9].x; + to[stride*10].x = aux[10].x; + to[stride*11].x = aux[11].x; + to[stride*12].x = aux[12].x; + to[stride*13].x = aux[13].x; + to[stride*14].x = aux[14].x; + to[stride*15].x = aux[15].x; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0].x; + __m256i b = kernel.packet[1].x; + __m256i c = kernel.packet[2].x; + __m256i d = kernel.packet[3].x; + __m256i e = kernel.packet[4].x; + __m256i f = kernel.packet[5].x; + __m256i g = kernel.packet[6].x; + __m256i h = kernel.packet[7].x; + __m256i i = kernel.packet[8].x; + __m256i j = kernel.packet[9].x; + __m256i k = kernel.packet[10].x; + __m256i l = kernel.packet[11].x; + __m256i m = kernel.packet[12].x; + __m256i n = kernel.packet[13].x; + __m256i o = kernel.packet[14].x; + __m256i p = kernel.packet[15].x; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = 
_mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. 
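+  // (Selector 0x20 concatenates the low 128-bit lanes of the two sources,
+  // 0x31 the high lanes.)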
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0].x = a_p_0; + kernel.packet[1].x = a_p_1; + kernel.packet[2].x = a_p_2; + kernel.packet[3].x = a_p_3; + kernel.packet[4].x = a_p_4; + kernel.packet[5].x = a_p_5; + kernel.packet[6].x = a_p_6; + kernel.packet[7].x = a_p_7; + kernel.packet[8].x = a_p_8; + kernel.packet[9].x = a_p_9; + kernel.packet[10].x = a_p_a; + kernel.packet[11].x = a_p_b; + kernel.packet[12].x = a_p_c; + kernel.packet[13].x = a_p_d; + kernel.packet[14].x = a_p_e; + kernel.packet[15].x = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + pstore(in[4], kernel.packet[4]); + pstore(in[5], kernel.packet[5]); + pstore(in[6], kernel.packet[6]); + pstore(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); + kernel.packet[4] = pload(out[4]); + kernel.packet[5] = pload(out[5]); + kernel.packet[6] = pload(out[6]); + kernel.packet[7] = pload(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + + +#elif defined EIGEN_VECTORIZE_AVX + +typedef struct { + __m128i x; +} Packet8h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8h type; + // 
There is no half-size packet for Packet8h. + typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { + Packet8h result; + result.x = _mm_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { + Packet8h result; + result.x = _mm_load_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { + Packet8h result; + result.x = _mm_loadu_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a.x); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet8h result; + result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + Eigen::half h4(aux[4]); + Eigen::half h5(aux[5]); + Eigen::half h6(aux[6]); + Eigen::half h7(aux[7]); + + Packet8h result; + result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) +{ + Packet8h result; + result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> 
EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) +{ + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul(af); + return Eigen::half(reduced); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0].x; + __m128i b = kernel.packet[1].x; + __m128i c = kernel.packet[2].x; + __m128i d = kernel.packet[3].x; + __m128i e = kernel.packet[4].x; + __m128i f = kernel.packet[5].x; + __m128i g = kernel.packet[6].x; + __m128i h = kernel.packet[7].x; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0].x = a0b0c0d0e0f0g0h0; + kernel.packet[1].x = a1b1c1d1e1f1g1h1; + kernel.packet[2].x = a2b2c2d2e2f2g2h2; + kernel.packet[3].x = a3b3c3d3e3f3g3h3; + kernel.packet[4].x = a4b4c4d4e4f4g4h4; + kernel.packet[5].x = a5b5c5d5e5f5g5h5; + kernel.packet[6].x = a6b6c6d6e6f6g6h6; + kernel.packet[7].x = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; 
j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. + typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); +} + +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { 
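+  // Same implementation as pload above: a single 64-bit scalar load, which
+  // x86 tolerates at any alignment.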
+ Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) +{ + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast(a); + to[stride*1].x = static_cast(a >> 16); + to[stride*2].x = static_cast(a >> 32); + to[stride*3].x = static_cast(a >> 48); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); +} + +#endif + +} +} + +#endif // EIGEN_PACKET_MATH_HALF_HIP_H diff --git a/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h b/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h new file mode 100644 index 000000000..915266a9d --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/TypeCasting.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_HIP_H +#define EIGEN_TYPE_CASTING_HIP_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __float2half(static_cast(a)); + #else + return Eigen::half(static_cast(a)); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + return __half2float(a); + #else + return static_cast(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + + +#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE) + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { + // Simply discard the second half of the input + return __floats2half2_rn(a.x, a.y); +} + +#elif defined EIGEN_VECTORIZE_AVX512 +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { + return float2half(a); +} + +#elif defined EIGEN_VECTORIZE_AVX + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { + return float2half(a); +} + +// Disable the following code since it's broken on too many platforms / compilers. 
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); + float f1 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 16)); + float f2 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 32)); + float f3 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 48)); + float f4 = static_cast(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_HIP_H diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h new file mode 100644 index 000000000..25375a0a4 --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h @@ -0,0 +1,23 @@ +/* + * math_constants.h - + * HIP equivalent of the CUDA header of the same name + */ + +#ifndef __MATH_CONSTANTS_H__ +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ + +#define HIPRT_INF_F __int_as_float(0x7f800000) +#define HIPRT_NAN_F __int_as_float(0x7fffffff) +#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001) +#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000) +#define HIPRT_ZERO_F 0.0f +#define HIPRT_ONE_F 1.0f + +/* double precision constants */ +#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000) +#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000) + +#endif diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 3eae6b8ca..e269140bd 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -436,6 +436,9 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC explicit + #endif bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -455,6 +458,9 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; + #if defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC explicit + #endif bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index b1791fb3a..a4cde6d95 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -163,7 +163,10 @@ class BlasLinearMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
BlasLinearMapper(Scalar *data) : m_data(data) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+  EIGEN_ALWAYS_INLINE void prefetch(int i) const {
     internal::prefetch(&operator()(i));
   }
 
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 9c68ecb7d..c6e27f6af 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -1008,9 +1008,12 @@ namespace Eigen {
 #  define EIGEN_TRY try
 #  define EIGEN_CATCH(X) catch (X)
 #else
-#  ifdef EIGEN_CUDA_ARCH
+#  if defined(EIGEN_CUDA_ARCH)
 #    define EIGEN_THROW_X(X) asm("trap;")
 #    define EIGEN_THROW asm("trap;")
+#  elif defined(EIGEN_HIP_DEVICE_COMPILE)
+#    define EIGEN_THROW_X(X) asm("s_trap 0")
+#    define EIGEN_THROW asm("s_trap 0")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 53300c388..87fcc30f5 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -70,7 +70,20 @@ inline void throw_std_bad_alloc()
   throw std::bad_alloc();
 #else
   std::size_t huge = static_cast<std::size_t>(-1);
+  #if defined(EIGEN_HIPCC)
+  //
+  // calls to "::operator new" are to be treated as opaque function calls (i.e. no inlining),
+  // and as a consequence the code in the #else block triggers the hipcc warning:
+  // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+  //
+  // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+  // the same on "operator new".
+  // Reverting the code back to the old version in this #if block for the hipcc compiler.
+  //
+  new int[huge];
+  #else
   ::operator new(huge);
+  #endif
 #endif
 }
 
@@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
   void *result;
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    result = aligned_malloc(size);
+    #else
     result = std::malloc(size);
+    #endif
+
     #if EIGEN_DEFAULT_ALIGN_BYTES==16
     eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fall back to the handmade aligned memory allocator.");
     #endif
@@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    aligned_free(ptr);
+    #else
     std::free(ptr);
+    #endif
+
   #else
     handmade_aligned_free(ptr);
   #endif
@@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size)
 {
   check_that_malloc_is_allowed();
 
+  #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  void *result = aligned_malloc(size);
+  #else
   void *result = std::malloc(size);
+  #endif
+
   if(!result && size)
     throw_std_bad_alloc();
   return result;
@@ -232,7 +262,11 @@ template<typename T> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
 template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
+  #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  aligned_free(ptr);
+  #else
   std::free(ptr);
+  #endif
 }
 
 template<typename T> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@@ -493,7 +527,11 @@ template<typename T> struct smart_copy_helper<T,true> {
     IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    ::memcpy(target, start, size);
+    #else
     std::memcpy(target, start, size);
+    #endif
   }
 };
 
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 6e5af35c0..ca6fa6ce9 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -16,6 +16,12 @@
 #include <math_constants.h>
 #endif
 
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#include <hip/hip_fp16.h>
+#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+#endif
+
+
 #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
 #include <cstdint>
 #endif
@@ -175,7 +181,7 @@ template<bool Condition, typename T=void> struct enable_if;
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
 
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
 #if !defined(__FLT_EPSILON__)
 #define __FLT_EPSILON__ FLT_EPSILON
 #define __DBL_EPSILON__ DBL_EPSILON
@@ -197,13 +203,31 @@ template<> struct numeric_limits<float>
   EIGEN_DEVICE_FUNC
   static float epsilon() { return __FLT_EPSILON__; }
   EIGEN_DEVICE_FUNC
-  static float (max)() { return CUDART_MAX_NORMAL_F; }
+  static float (max)() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_MAX_NORMAL_F;
+  #else
+    return HIPRT_MAX_NORMAL_F;
+  #endif
+  }
   EIGEN_DEVICE_FUNC
   static float (min)() { return FLT_MIN; }
   EIGEN_DEVICE_FUNC
-  static float infinity() { return CUDART_INF_F; }
+  static float infinity() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_INF_F;
+  #else
+    return HIPRT_INF_F;
+  #endif
+  }
   EIGEN_DEVICE_FUNC
-  static float quiet_NaN() { return CUDART_NAN_F; }
+  static float quiet_NaN() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_NAN_F;
+  #else
+    return HIPRT_NAN_F;
+  #endif
+  }
 };
 template<> struct numeric_limits<double>
 {
@@ -214,9 +238,21 @@ template<> struct numeric_limits<double>
   EIGEN_DEVICE_FUNC
   static double (min)() { return DBL_MIN; }
   EIGEN_DEVICE_FUNC
-  static double infinity() { return CUDART_INF; }
+  static double infinity() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_INF;
+  #else
+    return HIPRT_INF;
+  #endif
+  }
   EIGEN_DEVICE_FUNC
-  static double quiet_NaN() { return CUDART_NAN; }
+  static double quiet_NaN() {
+  #if defined(EIGEN_CUDA_ARCH)
+    return CUDART_NAN;
+  #else
+    return HIPRT_NAN;
+  #endif
+  }
 };
 
 template<> struct numeric_limits<int>
 {
@@ -529,13 +565,13 @@ template<typename T, typename U> struct scalar_product_traits
 
 namespace numext
{ -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) using internal::device::numeric_limits; #else using std::numeric_limits; diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 040f8d3bb..bf28edc0e 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -354,6 +354,7 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: + EIGEN_DEVICE_FUNC static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index a24deb96a..e977b9623 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1299,7 +1299,7 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif }//end deflation -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 7d2d63722..f818ae840 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -19,7 +19,9 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) - if(EIGEN_TEST_CUDA_CLANG) + if(EIGEN_TEST_HIP) + hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP") + elseif(EIGEN_TEST_CUDA_CLANG) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) if(CUDA_64_BIT_DEVICE_CODE) link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") @@ -491,6 +493,11 @@ macro(ei_testing_print_summary) else() message(STATUS "CUDA: OFF") endif() + if(EIGEN_TEST_HIP) + message(STATUS "HIP: ON (using hipcc)") + else() + message(STATUS "HIP: OFF") + endif() endif() # vectorization / alignment options diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e1eef086e..4a5c1d36d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -407,6 +407,48 @@ endif(CUDA_FOUND) endif(EIGEN_TEST_CUDA) +# HIP unit tests +option(EIGEN_TEST_HIP "Add HIP support." 
OFF) +if (EIGEN_TEST_HIP) + + set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") + + if (EXISTS ${HIP_PATH}) + + list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) + + find_package(HIP REQUIRED) + if (HIP_FOUND) + + execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM) + + if (${HIP_PLATFORM} STREQUAL "hcc") + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${HIP_PATH}/include) + + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") + ei_add_test(hip_basic) + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) + + elseif (${HIP_PLATFORM} STREQUAL "nvcc") + message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") + else () + message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") + endif() + + endif(HIP_FOUND) + + else () + + message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") + + endif() + +endif(EIGEN_TEST_HIP) + + + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests) add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON) diff --git a/test/hip_basic.cu b/test/hip_basic.cu new file mode 100644 index 000000000..2e1bf94a4 --- /dev/null +++ b/test/hip_basic.cu @@ -0,0 +1,172 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015-2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// workaround issue between gcc >= 4.7 and cuda 5.5 +#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7) + #undef _GLIBCXX_ATOMIC_BUILTINS + #undef _GLIBCXX_USE_INT128 +#endif + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC hip_basic +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#include + +#include "main.h" +#include "hip_common.h" + +// Check that dense modules can be properly parsed by hipcc +#include + +// struct Foo{ +// EIGEN_DEVICE_FUNC +// void operator()(int i, const float* mats, float* vecs) const { +// using namespace Eigen; +// // Matrix3f M(data); +// // Vector3f x(data+9); +// // Map(data+9) = M.inverse() * x; +// Matrix3f M(mats+i/16); +// Vector3f x(vecs+i*3); +// // using std::min; +// // using std::sqrt; +// Map(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x(); +// //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum(); +// } +// }; + +template +struct coeff_wise { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T x1(in+i); + T x2(in+i+1); + T x3(in+i+2); + Map res(out+i*T::MaxSizeAtCompileTime); + + res.array() += (in[0] * x1 + x2).array() * x3.array(); + } +}; + +template +struct replicate { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T x1(in+i); + int step = x1.size() * 4; + int stride = 3 * step; + + typedef Map > MapType; + MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2); + MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3); + MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3); + } +}; + +template +struct redux { + EIGEN_DEVICE_FUNC + void 
operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + int N = 10; + T x1(in+i); + out[i*N+0] = x1.minCoeff(); + out[i*N+1] = x1.maxCoeff(); + out[i*N+2] = x1.sum(); + out[i*N+3] = x1.prod(); + out[i*N+4] = x1.matrix().squaredNorm(); + out[i*N+5] = x1.matrix().norm(); + out[i*N+6] = x1.colwise().sum().maxCoeff(); + out[i*N+7] = x1.rowwise().maxCoeff().sum(); + out[i*N+8] = x1.matrix().colwise().squaredNorm().sum(); + } +}; + +template +struct prod_test { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const + { + using namespace Eigen; + typedef Matrix T3; + T1 x1(in+i); + T2 x2(in+i+1); + Map res(out+i*T3::MaxSizeAtCompileTime); + res += in[i] * x1 * x2; + } +}; + +template +struct diagonal { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const + { + using namespace Eigen; + T1 x1(in+i); + Map res(out+i*T2::MaxSizeAtCompileTime); + res += x1.diagonal(); + } +}; + +template +struct eigenvalues { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef Matrix Vec; + T M(in+i); + Map res(out+i*Vec::MaxSizeAtCompileTime); + T A = M*M.adjoint(); + SelfAdjointEigenSolver eig; + eig.computeDirect(M); + res = eig.eigenvalues(); + } +}; + +void test_hip_basic() +{ + ei_test_init_hip(); + + int nthreads = 100; + Eigen::VectorXf in, out; + + #ifndef __HIP_DEVICE_COMPILE__ + int data_size = nthreads * 512; + in.setRandom(data_size); + out.setRandom(data_size); + #endif + + CALL_SUBTEST( run_and_compare_to_hip(coeff_wise(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(coeff_wise(), nthreads, in, out) ); + + // FIXME compile fails when we uncomment the following two tests + // CALL_SUBTEST( run_and_compare_to_hip(replicate(), nthreads, in, out) ); + // CALL_SUBTEST( run_and_compare_to_hip(replicate(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(redux(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(redux(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(prod_test(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(prod_test(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_hip(diagonal(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_hip(diagonal(), nthreads, in, out) ); + + // FIXME: Runtime failure occurs when we uncomment the following two tests + // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues(), nthreads, in, out) ); + // CALL_SUBTEST( run_and_compare_to_hip(eigenvalues(), nthreads, in, out) ); + +} diff --git a/test/hip_common.h b/test/hip_common.h new file mode 100644 index 000000000..251585c52 --- /dev/null +++ b/test/hip_common.h @@ -0,0 +1,103 @@ + +#ifndef EIGEN_TEST_HIP_COMMON_H +#define EIGEN_TEST_HIP_COMMON_H + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include + +#ifndef __HIPCC__ +dim3 threadIdx, blockDim, blockIdx; +#endif + +template +void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out) +{ + for(int i=0; i +__global__ __attribute__((used)) +void run_on_hip_meta_kernel(const Kernel ker, int n, const Input* in, Output* out) +{ + int i = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x; + if(i +void run_on_hip(const Kernel& ker, int n, const Input& in, Output& out) +{ + typename Input::Scalar* d_in; + typename Output::Scalar* d_out; + std::ptrdiff_t in_bytes = in.size() * sizeof(typename 
Input::Scalar); + std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar); + + hipMalloc((void**)(&d_in), in_bytes); + hipMalloc((void**)(&d_out), out_bytes); + + hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice); + hipMemcpy(d_out, out.data(), out_bytes, hipMemcpyHostToDevice); + + // Simple and non-optimal 1D mapping assuming n is not too large + // That's only for unit testing! + dim3 Blocks(128); + dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) ); + + hipDeviceSynchronize(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_hip_meta_kernel::type, + typename std::decay::type>), + dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out); + hipDeviceSynchronize(); + + // check inputs have not been modified + hipMemcpy(const_cast(in.data()), d_in, in_bytes, hipMemcpyDeviceToHost); + hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost); + + hipFree(d_in); + hipFree(d_out); +} + + +template +void run_and_compare_to_hip(const Kernel& ker, int n, const Input& in, Output& out) +{ + Input in_ref, in_hip; + Output out_ref, out_hip; + #ifndef __HIP_DEVICE_COMPILE__ + in_ref = in_hip = in; + out_ref = out_hip = out; + #endif + run_on_cpu (ker, n, in_ref, out_ref); + run_on_hip(ker, n, in_hip, out_hip); + #ifndef __HIP_DEVICE_COMPILE__ + VERIFY_IS_APPROX(in_ref, in_hip); + VERIFY_IS_APPROX(out_ref, out_hip); + #endif +} + + +void ei_test_init_hip() +{ + int device = 0; + hipDeviceProp_t deviceProp; + hipGetDeviceProperties(&deviceProp, device); + std::cout << "HIP device info:\n"; + std::cout << " name: " << deviceProp.name << "\n"; + std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n"; + std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n"; + std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n"; + std::cout << " warpSize: " << deviceProp.warpSize << "\n"; + std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n"; + std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n"; + std::cout << " clockRate: " << deviceProp.clockRate << "\n"; + std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n"; + std::cout << " computeMode: " << deviceProp.computeMode << "\n"; +} + +#endif // EIGEN_TEST_HIP_COMMON_H diff --git a/test/main.h b/test/main.h index 0fcd6cb76..79717a532 100644 --- a/test/main.h +++ b/test/main.h @@ -67,11 +67,17 @@ // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
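Note: the hip_common.h helpers above deliberately skip error handling on hipMalloc / hipMemcpy / hipLaunchKernelGGL to keep the test harness short. For anything beyond a unit test, the usual idiom is to wrap every HIP runtime call; a minimal self-contained sketch follows (the HIP_CHECK macro is illustrative only and not part of this patch):

#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>

// Illustrative only, not introduced by this patch: abort with a readable
// message whenever a HIP runtime call fails.
#define HIP_CHECK(expr)                                              \
  do {                                                               \
    hipError_t hip_check_err = (expr);                               \
    if (hip_check_err != hipSuccess) {                               \
      std::fprintf(stderr, "HIP error '%s' at %s:%d\n",              \
                   hipGetErrorString(hip_check_err),                 \
                   __FILE__, __LINE__);                              \
      std::exit(EXIT_FAILURE);                                       \
    }                                                                \
  } while (0)

// Usage sketch: HIP_CHECK(hipMalloc((void**)(&d_in), in_bytes));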
-#define min(A,B) please_protect_your_min_with_parentheses -#define max(A,B) please_protect_your_max_with_parentheses -#define isnan(X) please_protect_your_isnan_with_parentheses -#define isinf(X) please_protect_your_isinf_with_parentheses -#define isfinite(X) please_protect_your_isfinite_with_parentheses +#if !defined(__HIPCC__) + // HIP headers include a header that contains not-parenthesized + // calls to "max", which would trigger the checks below and make the compile fail, + // so these checks are disabled when compiling with HIP + #define min(A,B) please_protect_your_min_with_parentheses + #define max(A,B) please_protect_your_max_with_parentheses + #define isnan(X) please_protect_your_isnan_with_parentheses + #define isinf(X) please_protect_your_isinf_with_parentheses + #define isfinite(X) please_protect_your_isfinite_with_parentheses +#endif + #ifdef M_PI #undef M_PI #endif @@ -154,7 +160,7 @@ namespace Eigen #define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "") -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) #define EIGEN_EXCEPTIONS #endif @@ -233,7 +239,7 @@ namespace Eigen } #endif //EIGEN_EXCEPTIONS - #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS + #elif !defined(__CUDACC__) && !defined(__HIPCC__) // EIGEN_DEBUG_ASSERTS // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3 #define eigen_assert(a) \ if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ @@ -290,7 +296,7 @@ namespace Eigen std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n"; #endif - #if !defined(__CUDACC__) + #if !defined(__CUDACC__) && !defined(__HIPCC__) #define EIGEN_USE_CUSTOM_ASSERT #endif diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index d243fe035..4b7c7d724 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t; #endif #ifdef EIGEN_USE_GPU + #include + #if defined(EIGEN_USE_HIP) + #include + #else + #include + #endif + #if __cplusplus >= 201103L + #include + #include + #endif -#include -#include -#if __cplusplus >= 201103L -#include -#include -#endif #endif #include "src/Tensor/TensorMacros.h" @@ -95,7 +99,11 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceCuda.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorDeviceHip.h" +#else + #include "src/Tensor/TensorDeviceCuda.h" +#endif #include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorDimensionList.h" @@ -112,16 +120,28 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionCuda.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorReductionHip.h" +#else + #include "src/Tensor/TensorReductionCuda.h" +#endif #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionCuda.h" +#if defined(EIGEN_USE_HIP) + #include 
"src/Tensor/TensorContractionHip.h" +#else + #include "src/Tensor/TensorContractionCuda.h" +#endif #include "src/Tensor/TensorConversion.h" -#include "src/Tensor/TensorConvolution.h" +#if defined(EIGEN_USE_HIP) + #include "src/Tensor/TensorConvolutionHip.h" +#else + #include "src/Tensor/TensorConvolution.h" +#endif #include "src/Tensor/TensorFFT.h" #include "src/Tensor/TensorPatch.h" #include "src/Tensor/TensorImagePatch.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e72ddb4a9..979fcf4d9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -448,7 +448,10 @@ struct TensorContractionEvaluatorBase } template - EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const Index cols = m_k_size; @@ -489,7 +492,10 @@ struct TensorContractionEvaluatorBase } template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemm(Scalar* buffer) const { #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) if (m_can_use_xsmm) { evalGemmXSMM(buffer); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index d34f9caee..4853dd37b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -28,7 +28,10 @@ class TensorContractionBlocking { typedef typename LhsMapper::Scalar LhsScalar; typedef typename RhsMapper::Scalar RhsScalar; - EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h new file mode 100644 index 000000000..7561846a3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h @@ -0,0 +1,1521 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, the contracting index should live in hipThreadIdx_x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_x * 9 + hipThreadIdx_z; + const Index rhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_z * 8 + hipThreadIdx_x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // hipThreadIdx_x: the vertical position in an 8x8 block + // hipThreadIdx_y: the vertical index of the 8x8 block in the grid + // hipThreadIdx_z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but that loop is now unrolled in the code below.) 
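// Note (illustration, not part of this patch): a quick sanity check of the
// padded layout described above. With an 8x8x8 thread block the largest
// lhs_store_idx_base is 7*72 + 7*9 + 7 = 574, and the eighth prefetch slot
// adds 576*7 = 4032, for 4606 in total, safely inside the 72*64 = 4608
// scalars of lhs_shmem declared in EigenContractionKernel further down.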
+ + const Index load_idx_vert = hipThreadIdx_x + 8 * hipThreadIdx_y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + hipThreadIdx_z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + hipThreadIdx_z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + hipThreadIdx_z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + hipThreadIdx_z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + hipThreadIdx_z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + hipThreadIdx_z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + hipThreadIdx_z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + hipThreadIdx_z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + hipThreadIdx_z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + hipThreadIdx_z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + hipThreadIdx_z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + hipThreadIdx_z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + hipThreadIdx_z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + hipThreadIdx_z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + hipThreadIdx_z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + hipThreadIdx_z + 7 * 
8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. 
Despite common sense, + // the code is a bit faster with this here than at the bottom of the loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[hipThreadIdx_x + 9 * hipThreadIdx_y]; + const Scalar* rhs_block = &rhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (i.e. width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. 
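Note: the reduction below is a standard xor-shuffle butterfly. After exchanging partial sums with masks 1, 2 and 4, every thread in each aligned group of 8 lanes holds the total of all 8 contributions; reduceMatrix simply applies this to all 64 accumulators. A minimal out-of-line illustration (a sketch only, not part of this patch, using the same non-sync __shfl_xor the kernel uses):

// Sketch: butterfly sum over aligned groups of 8 lanes. After the three
// steps each lane holds the sum of all 8 lanes' inputs, which is exactly
// what reduceMatrix(1); reduceMatrix(2); reduceMatrix(4); achieves below.
__device__ float butterfly_sum8(float v) {
  v += __shfl_xor(v, 1);
  v += __shfl_xor(v, 2);
  v += __shfl_xor(v, 4);
  return v;
}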
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There are two ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (hipThreadIdx_x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - hipThreadIdx_y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - hipThreadIdx_z + 7) / 8), 8); + + if (hipThreadIdx_x < max_i_write) { + if (max_j_write == 8) { + // TODO: can I trade bank conflicts for coalesced writes? 
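// Note (illustration, not part of this patch): max_i_write / max_j_write
// clamp each thread's stores to the valid remainder of this 64x64 output
// tile. For example, with m_size = 70 and base_m = 64, a thread with
// hipThreadIdx_y = 2 gets (70 - 64 - 2 + 7) / 8 = 1, so only its
// hipThreadIdx_x == 0 slot (output row 66) is written instead of all eight.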
+ Scalar val0 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 0]; + Scalar val1 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 1]; + Scalar val2 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 2]; + Scalar val3 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 3]; + Scalar val4 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 4]; + Scalar val5 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 5]; + Scalar val6 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 6]; + Scalar val7 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 7]; + + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 0) = val0; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 1) = val1; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 2) = val2; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 3) = val3; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 4) = val4; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 5) = val5; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 6) = val6; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j]; + output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * j) = val; + } + } + } +#undef res +} + + +template +__global__ void +__launch_bounds__(512, 1) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + /*reg = lhs.template loadPacket(row, col);*/ \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row + 3, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + /*reg =lhs.template loadPacket(row, col);*/ \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + reg.w =lhs(row 
+ 3, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+hipThreadIdx_x*4; + + for (Index k = 0; k < k_size; k += 16) { + //lhs_pf0 = internal::pset1(0); + //rhs_pf0 = internal::pset1(0); + lhs_pf0 = make_float4(0, 0, 0, 0); + rhs_pf0 = make_float4(0, 0, 0, 0); + + Index lhs_horiz = hipThreadIdx_y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(hipThreadIdx_x%4)*4; + Index rhs_horiz0 = (hipThreadIdx_x>>2)+hipThreadIdx_y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((hipThreadIdx_x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((hipThreadIdx_x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2][hipThreadIdx_x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2+32][hipThreadIdx_x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... 
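// Note (illustration, not part of this patch): the __shfl_xor(x1, 4) /
// __shfl_xor(x2, 4) exchange earlier pairs each lane l with lane l ^ 4,
// which loaded the neighbouring output column (their rhs_horiz0 values
// differ by one). Swapping halves of the two float4 loads lets each
// make_float2 store above combine the same k row of two adjacent feature
// columns, producing the shared-memory layout sketched in these comments.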
+ + lhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[hipThreadIdx_y+16][hipThreadIdx_x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][hipThreadIdx_x]; + float2 fl2 = lhs_shmem2[koff + 16][hipThreadIdx_x]; + + int start_feature = hipThreadIdx_y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = hipThreadIdx_y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
+ for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+hipThreadIdx_x*4+(hipThreadIdx_y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + /*lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0);*/ + + + lhs_pf0 = make_float4(0, 0, 0, 0); + lhs_pf1 = make_float4(0, 0, 0, 0); + lhs_pf2 = make_float4(0, 0, 0, 0); + lhs_pf3 = make_float4(0, 0, 0, 0); + + rhs_pf0 = make_float4(0, 0, 0, 0); + rhs_pf1 = make_float4(0, 0, 0, 0); + + if (!CHECK_LHS_BOUNDARY) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); + //lhs_pf3 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+24)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + lhs_pf3.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, 
(hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); + //lhs_pf3 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+24)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + lhs_pf3.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf2.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); + lhs_pf0.x =lhs(lhs_vert + 
0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf1.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf0.w =lhs(lhs_vert + 3, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); + } 
else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((hipThreadIdx_y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); + } else if ((hipThreadIdx_y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); + } else if ((hipThreadIdx_y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); + } else if ((hipThreadIdx_y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+hipThreadIdx_x*4; + Index rhs_horiz0 = hipThreadIdx_y*2+base_n; + Index rhs_horiz1 = hipThreadIdx_y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + //rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + //rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+hipThreadIdx_x*4 + 1 < k_size) { + rhs_pf0.x = 
rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+hipThreadIdx_x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[hipThreadIdx_y+32][hipThreadIdx_x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[hipThreadIdx_y+64][hipThreadIdx_x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[hipThreadIdx_y+96][hipThreadIdx_x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[hipThreadIdx_y/4][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[hipThreadIdx_y/4+8][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[hipThreadIdx_y/4+16][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[hipThreadIdx_y/4+24][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[hipThreadIdx_y/4 + 32][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[hipThreadIdx_y/4 + 40][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[hipThreadIdx_y/4 + 48][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[hipThreadIdx_y/4 + 56][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; + + // first feature is at (hipThreadIdx_y/4) * 8 last is at start + 8. 
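+      // Worked example of the indexing above (illustrative only): for
+      // hipThreadIdx_y == 13 this thread owns features 24..31, since
+      // start_feature = (13 / 4) * 8 = 24; with koff == 6 it then reads
+      // br1 = rhs_shmem2[24/2 + (6 % 4) * 32][6 / 4] = rhs_shmem2[76][1].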
+ int start_feature = (hipThreadIdx_y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (hipThreadIdx_y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
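+      // Both the m edge and the n edge can be ragged in this case, so every
+      // row store below is guarded by its own bounds check.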
+ for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +__launch_bounds__(256, 1) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +__launch_bounds__(256, 1) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = hipBlockIdx_x; + const Index n_block_idx = hipBlockIdx_y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the 
code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + // We need to redefine this method to make hipcc happy + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenContractionKernel), + dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel16x16), + 
dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel), + dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setHipSharedMemConfig(hipSharedMemBankSizeEightByte); + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_HIPCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h new file mode 100644 index 000000000..ba9971050 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h @@ -0,0 +1,1119 @@ +//#include "hip/hip_runtime.h" +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class.
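+  *
+  * A minimal usage sketch (illustrative; the sizes are arbitrary and the
+  * convolve() call is the generic Tensor API, not specific to this file):
+  * \code
+  * Eigen::Tensor<float, 3> input(20, 30, 40);
+  * Eigen::Tensor<float, 2> kernel(3, 3);
+  * Eigen::array<ptrdiff_t, 2> dims({1, 2});  // convolve along dims 1 and 2
+  * Eigen::Tensor<float, 3> result = input.convolve(kernel, dims);
+  * // result is 20 x 28 x 38: each convolved dimension shrinks by
+  * // kernel_dim - 1, matching result_dim = input_dim - kernel_dim + 1 below.
+  * \endcode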
+ * + * + */ +namespace internal { + +template +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array hipInputDimensions; + array hipOutputDimensions; + array tmp = dimensions; + array ordering; + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + hipInputDimensions[index] = input_dims[indices[i]]; + hipOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast(Layout) == static_cast(ColMajor) + ? NumKernelDims + : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + hipInputDimensions[written] = input_dims[i]; + hipOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_hipInputStrides[i] = + m_hipInputStrides[i - 1] * hipInputDimensions[i - 1]; + m_hipOutputStrides[i] = + m_hipOutputStrides[i - 1] * hipOutputDimensions[i - 1]; + } else { + m_hipInputStrides[i] = 1; + m_hipOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (i + 1 < offset) { + m_hipInputStrides[i] = + m_hipInputStrides[i + 1] * hipInputDimensions[i + 1]; + m_hipOutputStrides[i] = + m_hipOutputStrides[i + 1] * hipOutputDimensions[i + 1]; + } else { + m_hipInputStrides[i] = 1; + m_hipOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_hipInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_hipInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_hipInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_hipInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { 
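+      // (Clarifying note: p is a linear index over the non-kernel "plane"
+      // dimensions. Each iteration below peels one coordinate off p by
+      // dividing by the hip-side stride, then re-linearizes that coordinate
+      // with the corresponding tensor-side output stride.)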
+ for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_hipOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_hipOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_hipOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_hipOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; + } + + private: + static const int NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_hipInputStrides; + array m_hipOutputStrides; +}; + + + +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
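+  // For instance, convolving float data with a float kernel yields
+  // Scalar == float; for mixed input/kernel scalar types the promoted
+  // storage type is used instead (informal example).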
+ typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + + + +template +class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = 
m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + 
return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. 
it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + +template +__global__ void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = hipBlockIdx_y * hipBlockDim_y; + const int plane_stride = hipBlockDim_y * hipGridDim_y; + + for (int p = first_plane + hipThreadIdx_y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = hipThreadIdx_y * num_x_input; + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template +__global__ void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = hipBlockIdx_y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = hipBlockIdx_z * hipBlockDim_z; + const int plane_stride = hipBlockDim_z * hipGridDim_z; + + for (int p = first_plane + hipThreadIdx_z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = hipThreadIdx_z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { + #pragma unroll + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template +__global__ void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { + HIP_DYNAMIC_SHARED( float, s) + + // Load inputs to shared memory + const int first_x = hipBlockIdx_x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = hipBlockIdx_y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = hipBlockIdx_z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = hipThreadIdx_z; k < num_z_input; k += hipBlockDim_z) { + for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { + for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { + const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); + + for (int k = hipThreadIdx_z; k < num_z_output; k += hipBlockDim_z) { + for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { + for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template +struct TensorEvaluator, GpuDevice> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& 
dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxHipThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxHipThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumHipMultiProcessors(); + const int hipWarpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximize the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini(inner_dim, numX); + const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible along the innermost dimension, that is, the plane + const int inner_dim = maxSharedMem / ((hipWarpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini(inner_dim, numP); + maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(hipWarpSize, maxX); + block_size.y = numext::mini(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0]); + const array kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, 4>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, 7>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ?
1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini(inner_dim, numX); + const int maxY = numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini(1024/block_size.x, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[idxX], m_indices[idxY]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 4, 7>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 7, 4>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, Dynamic, 
Dynamic>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const int idxZ = + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel3D, Index, InputDims>), + dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, + numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
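+    // Informally: total cost per output coefficient ~= firstIndex cost +
+    // kernel_size * (input read + kernel read + multiply-accumulate),
+    // i.e. every output coefficient touches every kernel entry exactly once.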
+ const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 341889e88..e94e577fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -35,9 +35,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef EIGEN_CUDA_ARCH +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running on the host CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; #else // Running on a CUDA device return 32; @@ -45,7 +48,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running on the host CPU return l1CacheSize(); #else @@ -55,7 +58,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running single threaded on the host CPU return l3CacheSize(); #else @@ -65,10 +68,14 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef EIGEN_CUDA_ARCH +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; #else // Running on a CUDA device return EIGEN_CUDA_ARCH / 100; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h new file mode 100644 index 000000000..c0e110987 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h @@ -0,0 +1,352 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
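+
+// Note: the guard below folds the EIGEN_USE_GPU feature test into the
+// include guard itself, so this header expands to nothing when GPU
+// support is disabled.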
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
+
+#if defined(EIGEN_HIPCC)
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#endif
+#include <unistd.h> // for the sleep function
+
+namespace Eigen {
+
+static const int kHipScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const hipStream_t& stream() const = 0;
+  virtual const hipDeviceProp_t& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting it to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
+};
+
+static hipDeviceProp_t* m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+  if (!m_devicePropInitialized) {
+    // Attempts to ensure proper behavior in the case of multiple threads
+    // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutexes don't
+    // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support c++11 we
+    // can't ensure that the initialization is thread safe.
+#if 0 && __cplusplus >= 201103L
+    static std::atomic<bool> first(true);
+    if (first.exchange(false)) {
+#else
+    static bool first = true;
+    if (first) {
+      first = false;
+#endif
+      // We're the first thread to reach this point.
+      int num_devices;
+      hipError_t status = hipGetDeviceCount(&num_devices);
+      if (status != hipSuccess) {
+        std::cerr << "Failed to get the number of HIP devices: "
+                  << hipGetErrorString(status)
+                  << std::endl;
+        assert(status == hipSuccess);
+      }
+      m_deviceProperties = new hipDeviceProp_t[num_devices];
+      for (int i = 0; i < num_devices; ++i) {
+        status = hipGetDeviceProperties(&m_deviceProperties[i], i);
+        if (status != hipSuccess) {
+          std::cerr << "Failed to initialize HIP device #"
+                    << i
+                    << ": "
+                    << hipGetErrorString(status)
+                    << std::endl;
+          assert(status == hipSuccess);
+        }
+      }
+
+#if 0 && __cplusplus >= 201103L
+      std::atomic_thread_fence(std::memory_order_release);
+#endif
+      m_devicePropInitialized = true;
+    } else {
+      // Wait for the other thread to initialize the properties.
+      while (!m_devicePropInitialized) {
+#if 0 && __cplusplus >= 201103L
+        std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+        sleep(1);
+      }
+    }
+  }
+}
+
+static const hipStream_t default_stream = 0x00; // TODO: use hipStreamDefault instead of 0x00
+
+class HipStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+    hipGetDevice(&device_);
+    initializeDeviceProp();
+  }
+  // Use the default stream on the specified device
+  HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    initializeDeviceProp();
+  }
+  // Use the specified stream. Note that it is the caller's
+  // responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  HipStreamDevice(const hipStream_t* stream, int device = -1)
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    if (device < 0) {
+      hipGetDevice(&device_);
+    } else {
+      int num_devices;
+      hipError_t err = hipGetDeviceCount(&num_devices);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == hipSuccess);
+      assert(device < num_devices);
+      device_ = device;
+    }
+    initializeDeviceProp();
+  }
+
+  virtual ~HipStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
+  const hipStream_t& stream() const { return *stream_; }
+  const hipDeviceProp_t& deviceProperties() const {
+    return m_deviceProperties[device_];
+  }
+  virtual void* allocate(size_t num_bytes) const {
+    hipError_t err = hipSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == hipSuccess);
+    void* result;
+    err = hipMalloc(&result, num_bytes);
+    assert(err == hipSuccess);
+    assert(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    hipError_t err = hipSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == hipSuccess);
+    assert(buffer != NULL);
+    err = hipFree(buffer);
+    assert(err == hipSuccess);
+  }
+
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(kHipScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kHipScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      //hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int));
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == hipSuccess);
+    }
+    return semaphore_;
+  }
+
+ private:
+  const hipStream_t* stream_;
+  int device_;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+    eigen_assert(stream);
+  }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+ EIGEN_STRONG_INLINE const hipStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#if !defined(EIGEN_HIP_DEVICE_COMPILE) + hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + hipError_t err = + hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + hipError_t err = + hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#if !defined(EIGEN_HIP_DEVICE_COMPILE) + //TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream()); + hipError_t err = hipMemset(buffer, c, n); + EIGEN_UNUSED_VARIABLE(err) + assert(err == hipSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip devices. 
+    return firstLevelCacheSize();
+  }
+
+// FIXME - this will move into HIP
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#undef assert
+#define assert(COND)
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#if defined(EIGEN_HIPCC) && \
+    !defined(EIGEN_HIP_DEVICE_COMPILE)
+    hipError_t err = hipStreamSynchronize(stream_->stream());
+    if (err != hipSuccess) {
+      std::cerr << "Error detected in HIP stream: "
+                << hipGetErrorString(err)
+                << std::endl;
+      assert(err == hipSuccess);
+    }
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const {
+    return stream_->deviceProperties().multiProcessorCount;
+  }
+  EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const {
+    return stream_->deviceProperties().maxThreadsPerBlock;
+  }
+  EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+    return stream_->deviceProperties().sharedMemPerBlock;
+  }
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return stream_->deviceProperties().major;
+  }
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+    return stream_->deviceProperties().minor;
+  }
+
+  EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
+  // This function checks if the HIP runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+#if defined(EIGEN_HIPCC)
+    hipError_t error = hipStreamQuery(stream_->stream());
+    return (error == hipSuccess) || (error == hipErrorNotReady);
+#else
+    return false;
+#endif
+  }
+
+ private:
+  const StreamInterface* stream_;
+  int max_blocks_;
+};
+
+#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \
+  assert(hipGetLastError() == hipSuccess);
+
+
+// FIXME: Should be device and kernel specific.
+#if defined(EIGEN_HIPCC)
+static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) {
+#if !defined(EIGEN_HIP_DEVICE_COMPILE)
+  hipError_t status = hipDeviceSetSharedMemConfig(config);
+  EIGEN_UNUSED_VARIABLE(status)
+  assert(status == hipSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 0ffe68ab3..24a57970a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -201,7 +201,7 @@ class TensorExecutor {
 };
 
 
-#if defined(EIGEN_CUDACC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 template <typename Evaluator, typename Index, bool Vectorizable>
 struct EigenMetaKernelEval {
   static __device__ EIGEN_ALWAYS_INLINE
@@ -250,6 +250,17 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign) {
+#if defined(EIGEN_HIPCC)
+    const int block_size = device.maxHipThreadsPerBlock();
+    const int max_blocks = device.getNumHipMultiProcessors() *
+                           device.maxHipThreadsPerMultiProcessor() / block_size;
+    const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
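+    // For illustration (hypothetical numbers): with block_size = 256,
+    // size = 1000 gives divup(1000, 256) = 4 blocks, while size = 0 gives
+    // divup(0, 256) = 0, which the numext::maxi below clamps back to 1 so
+    // the launch configuration stays valid.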
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+                       dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size);
+#else
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
@@ -260,11 +271,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
 
     LAUNCH_CUDA_KERNEL(
         (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
         num_blocks, block_size, 0, device, evaluator, size);
+#endif
   }
 
   evaluator.cleanup();
 }
 
-#endif // EIGEN_CUDACC
+#endif // EIGEN_CUDACC || EIGEN_HIPCC
 #endif // EIGEN_USE_GPU
 
 // SYCL Executor policy
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index c015ce196..b8f0bc798 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -109,7 +109,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
     const Index numValues = internal::array_prod(m_impl.dimensions());
     m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
     // Should initialize the memory in case we're dealing with non POD types.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 3209fecd3..835efbf72 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -350,7 +350,11 @@ struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
 
 namespace internal {
 
-template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+template<typename FirstType, typename... OtherTypes>
+  #if defined(EIGEN_HIPCC)
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  #endif
+  size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
   size_t result = 1;
   for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
     result *= sizes[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index c9e61f359..8e1ba486d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
  */
 
 // SFINAE requires variadic templates
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   // SFINAE doesn't work for gcc <= 4.7
   #ifdef EIGEN_COMP_GNUC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 5431eb740..de1075cc1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16)))
 template <>
 struct PacketType<half, GpuDevice> {
   typedef half2 type;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index e59074506..2a979845b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -858,7 +858,10 @@ struct TensorEvaluator
 template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 
-#ifdef EIGEN_HAS_CUDA_FP16
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
 template <typename S, typename R, typename I>
 __global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
 template <int B, int N, typename S, typename R, typename I>
@@ -495,7 +495,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
+  EIGEN_STRONG_INLINE
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+  bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
     m_impl.evalSubExprsIfNeeded(NULL);
 
     // Use the FullReducer if possible.
@@ -694,9 +698,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
 #ifdef EIGEN_USE_THREADS
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
   template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
   template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
   template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
   template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
@@ -774,14 +778,22 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   // Indexed by reduced dimensions.
   array<Index, NumReducedDims> m_reducedDims;
 
+#if defined(EIGEN_HIPCC)
+ public:
+#endif
+
   // Evaluator for the input expression.
   TensorEvaluator<ArgType, Device> m_impl;
 
+#if defined(EIGEN_HIPCC)
+ private:
+#endif
+
   // Operation to apply for computing the reduction.
   Op m_reducer;
 
   // For full reductions
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
   static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
   static const bool RunningOnSycl = false;
 #elif defined(EIGEN_USE_SYCL)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h
new file mode 100644
index 000000000..5304a22c5
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h
@@ -0,0 +1,815 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+#endif
+
+#if defined(EIGEN_HIPCC)
+#define HIP_WARP_SIZE 64
+#endif
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple hip threads to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value.
+// If in the meantime another hip thread updated the content of the output
+// address, it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
+  if (sizeof(T) == 4)
+  {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else {
+    assert(0 && "Wordsize not supported");
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#if defined(EIGEN_HAS_HIP_FP16)
+template
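The CAS retry loop in atomicReduce above is easiest to see with a concrete reducer. The sketch below is illustrative only and not part of the patch: MiniSumReducer is a hypothetical functor with the reduce(accum, dest) shape that atomicReduce expects, and the kernel shows how each thread folds one partial value into a single output word, retrying whenever another thread wins the compare-and-swap race.

// Illustrative sketch (not from the patch); assumes a HIP device build in
// which Eigen::internal::atomicReduce above is compiled in.
struct MiniSumReducer {
  // atomicReduce calls reduce() with a pointer that aliases the 4- or
  // 8-byte word it is about to CAS into place.
  __device__ void reduce(float accum, float* dest) const { *dest += accum; }
};

__global__ void fold_partials(const float* partials, int n, float* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    MiniSumReducer reducer;
    // Lost races are handled inside atomicReduce: it re-reads *out,
    // re-applies reduce(), and retries the atomicCAS.
    Eigen::internal::atomicReduce(out, partials[i], reducer);
  }
}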