diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 04a321b9f..3bba02278 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -287,6 +287,14 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet version of \a *from (unaligned masked load). + * There is no generic implementation; implementations exist only for specialized + * cases. The generic case should not be called. + */ +template EIGEN_DEVICE_FUNC inline +typename enable_if::masked_load_available, Packet>::type +ploadu(const typename unpacket_traits::type* from, typename unpacket_traits::mask_t umask); + /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 5b8ff59bd..3d229cd81 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -47,7 +47,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -263,7 +263,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef 
std::complex type; enum {size=2, alignment=Aligned32, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index f88e36024..3f94f85eb 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -117,14 +117,15 @@ template<> struct unpacket_traits { typedef float type; typedef Packet4f half; typedef Packet8i integer_packet; - enum {size=8, alignment=Aligned32, vectorizable=true}; + typedef uint8_t mask_t; + enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true}; }; template<> struct unpacket_traits { typedef double type; typedef Packet2d half; - enum {size=4, alignment=Aligned32, vectorizable=true}; + enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false}; }; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false}; }; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } @@ -385,6 +386,14 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { EI template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } 
template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from, uint8_t umask) { + __m256i mask = _mm256_set1_epi8(static_cast(umask)); + const __m256i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + mask = _mm256_or_si256(mask, bit_mask); + mask = _mm256_cmpeq_epi32(mask, _mm256_set1_epi32(0xffffffff)); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask); +} + // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 9a89dd01f..5ab2ffe77 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -51,7 +51,8 @@ template<> struct unpacket_traits { enum { size = 8, alignment=unpacket_traits::alignment, - vectorizable=true + vectorizable=true, + masked_load_available=false }; typedef Packet4cf half; }; @@ -247,7 +248,8 @@ template<> struct unpacket_traits { enum { size = 4, alignment = unpacket_traits::alignment, - vectorizable=true + vectorizable=true, + masked_load_available=false }; typedef Packet2cd half; }; diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 60b723b08..094309eda 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -102,19 +102,20 @@ struct unpacket_traits { typedef float type; typedef Packet8f half; typedef Packet16i integer_packet; - enum { size = 16, alignment=Aligned64, vectorizable=true }; + typedef uint16_t mask_t; + enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true }; }; template <> struct unpacket_traits { typedef double type; typedef Packet4d 
half; - enum { size = 8, alignment=Aligned64, vectorizable=true }; + enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false }; }; template <> struct unpacket_traits { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64, vectorizable=false }; + enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false }; }; template <> @@ -485,6 +486,12 @@ EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { reinterpret_cast(from)); } +template <> +EIGEN_STRONG_INLINE Packet16f ploadu(const float* from, uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from); +} + // Loads 8 floats from memory a returns the packet // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} template <> diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 0c9f8e05d..ebc3b2aeb 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -60,7 +60,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -291,7 +291,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload 
(const std::complex* from) { return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 9535724eb..b5484e6aa 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -192,13 +192,13 @@ template<> struct unpacket_traits typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> struct unpacket_traits { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=false}; + enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false}; }; inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) @@ -921,7 +921,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2d half; }; inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index cd4615a45..7fac0a5e1 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -92,8 +92,8 @@ template<> struct packet_traits : default_packet_traits }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, 
vectorizable=true}; typedef double2 half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef double2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { return make_float4(from, from, from, from); diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index cd518c7e4..2bee56f0f 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -42,7 +42,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef half2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { return __half2half2(from); @@ -567,7 +567,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; }; +template<> struct unpacket_traits { typedef Eigen::half type; typedef uint16_t mask_t; enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=true}; typedef Packet16h half; }; template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { Packet16h result; @@ -591,6 +591,14 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* fr return result; } +template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from, + uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + 
Packet16h result; + result.x = _mm256_maskz_loadu_epi16(mask, from); + return result; +} + template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { // (void*) -> workaround clang warning: // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 @@ -1056,7 +1064,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet8h half; }; template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { Packet8h result; @@ -1419,7 +1427,7 @@ struct packet_traits : default_packet_traits { }; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet4h half; }; template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { Packet4h result; diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index fa64d3564..0ced061c5 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -127,7 +127,7 @@ struct packet_traits > : default_packet_traits { template <> struct unpacket_traits { typedef std::complex type; - enum { size = 2, alignment = Aligned16, vectorizable=true }; + enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet2cf half; }; @@ -500,7 +500,7 @@ struct packet_traits > : default_packet_traits { template <> struct unpacket_traits { typedef std::complex type; - enum { size = 1, alignment = Aligned16, vectorizable=true }; + enum { size = 1, alignment = 
Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet1cd half; }; diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index a97156a84..f426d5b83 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -117,14 +117,14 @@ struct packet_traits : default_packet_traits { template <> struct unpacket_traits { typedef float type; - enum { size = 4, alignment = Aligned16, vectorizable=true }; + enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet4f half; }; template <> struct unpacket_traits { typedef int32_t type; - enum { size = 4, alignment = Aligned16, vectorizable=true }; + enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet4i half; }; @@ -925,7 +925,7 @@ struct packet_traits : default_packet_traits { template <> struct unpacket_traits { typedef double type; - enum { size = 2, alignment = Aligned16, vectorizable=true }; + enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false }; typedef Packet2d half; }; diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index f6c5c211c..c17d0a03e 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -62,7 +62,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -340,7 +340,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, 
alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e8b351849..b8051cfbf 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -145,13 +145,13 @@ template<> struct unpacket_traits typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> struct unpacket_traits { typedef int32_t type; typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=true}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } @@ -657,7 +657,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2d half; }; template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index f39988eac..7d89c323b 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h 
+++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -50,7 +50,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -283,7 +283,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 9c3750af0..04b63601b 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -166,17 +166,17 @@ template<> struct unpacket_traits { typedef float type; typedef Packet4f half; typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true}; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> struct unpacket_traits { typedef double type; typedef Packet2d half; - enum {size=2, alignment=Aligned16, vectorizable=true}; + 
enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; }; template<> struct unpacket_traits { typedef int type; typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=false}; + enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false}; }; #ifndef EIGEN_VECTORIZE_AVX diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h index 294cb101a..1afa63bb2 100644 --- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -88,7 +88,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2) #define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\ template<> struct unpacket_traits {\ typedef unpacket_type type;\ - enum {size=lengths, alignment=Aligned16, vectorizable=true};\ + enum {size=lengths, alignment=Aligned16, vectorizable=true, masked_load_available=false};\ typedef packet_type half;\ }; SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 167c3ee4c..9fcbcb82d 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -91,8 +91,8 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet1cd half; }; /* Forward declaration */ EIGEN_STRONG_INLINE void 
ptranspose(PacketBlock& kernel); diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index c8e90f1a8..74e0a1313 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -239,9 +239,9 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4i half; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false}; typedef Packet2d half; }; /* Forward declaration */ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 91c2e42e4..ce019946b 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -185,7 +185,8 @@ template struct unpacket_traits { size = 1, alignment = 1, - vectorizable = false + vectorizable = false, + masked_load_available=false }; }; diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 4906f6eb0..200670b8c 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -118,6 +118,9 @@ struct packet_helper template inline Packet load(const T* from) const { return internal::pload(from); } + template + inline Packet load(const T* 
from, unsigned long long umask) const { return internal::ploadu(from, umask); } + template inline void store(T* to, const Packet& x) const { internal::pstore(to,x); } }; @@ -128,6 +131,9 @@ struct packet_helper template inline T load(const T* from) const { return *from; } + template + inline T load(const T* from, unsigned long long) const { return *from; } + template inline void store(T* to, const T& x) const { *to = x; } }; @@ -169,6 +175,7 @@ template void packetmath() const int size = PacketSize*max_size; EIGEN_ALIGN_MAX Scalar data1[size]; EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar data3[size]; EIGEN_ALIGN_MAX Packet packets[PacketSize*2]; EIGEN_ALIGN_MAX Scalar ref[size]; RealScalar refvalue = RealScalar(0); @@ -194,6 +201,22 @@ template void packetmath() VERIFY(areApprox(data1, data2+offset, PacketSize) && "internal::pstoreu"); } + if (internal::unpacket_traits::masked_load_available) + { + unsigned long long max_umask = (0x1ull << PacketSize); + for (int offset=0; offset::masked_load_available, Packet> h; + h.store(data2, h.load(data1+offset, umask)); + for (int k=0; k> k) ? data1[k+offset] : Scalar(0); + VERIFY(areApprox(data3, data2, PacketSize) && "internal::ploadu masked"); + } + } + } + for (int offset=0; offset(m_data + index); } + // Return a packet starting at `index`, where `umask` specifies which elements + // have to be loaded. The type/size of the mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t: if a bit is 1, the corresponding + // float element is loaded; otherwise 0 is loaded. + // The function is a template to enable SFINAE. 
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const + { + return internal::ploadu(m_data + index, umask); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { @@ -244,6 +256,18 @@ struct TensorEvaluator return internal::ploadt_ro(m_data + index); } + // Return a packet starting at `index`, where `umask` specifies which elements + // have to be loaded. The type/size of the mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t: if a bit is 1, the corresponding + // float element is loaded; otherwise 0 is loaded. + // The function is a template to enable SFINAE. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const + { + return internal::ploadu(m_data + index, umask); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { eigen_assert(m_data); const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords)