diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index e406840a3..ece04b754 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -849,7 +849,7 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x) { return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x); -} +} template EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 017d47a88..1487c53ca 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -10,22 +10,24 @@ #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H #define EIGEN_PACKET_MATH_HALF_CUDA_H -#if defined(EIGEN_HAS_CUDA_FP16) +//#if defined(EIGEN_HAS_CUDA_FP16) // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +//#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) + -// Most of the following operations require arch >= 3.0 -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 namespace Eigen { namespace internal { +// Most of the following operations require arch >= 3.0 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + template<> struct is_arithmetic { enum { value = true }; }; -template<> struct packet_traits : default_packet_traits +template<> struct packet_traits : default_packet_traits { typedef half2 type; typedef half2 half; @@ -43,35 +45,35 @@ template<> struct packet_traits : default_packet_traits }; -template<> struct unpacket_traits { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const half& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { return __half2half2(from); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { return *reinterpret_cast(from); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); } -template<> EIGEN_STRONG_INLINE half2 ploaddup(const half* from) { +template<> EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { return __halves2half2(from[0], from[0]); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(half* to, const half2& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { *reinterpret_cast(to) = from; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, const half2& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { to[0] = __low2half(from); to[1] = __high2half(from); } template<> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { #if __CUDA_ARCH__ >= 350 return __ldg((const half2*)from); #else @@ -80,7 +82,7 @@ template<> } template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { #if __CUDA_ARCH__ >= 350 return __halves2half2(__ldg(from+0), __ldg(from+1)); #else @@ -88,16 +90,16 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const ha #endif } -template<> EIGEN_DEVICE_FUNC inline half2 pgather(const half* from, Index stride) { +template<> EIGEN_DEVICE_FUNC inline half2 pgather(const Eigen::half* from, Index stride) { return __halves2half2(from[0*stride], from[1*stride]); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(half* to, const half2& from, Index stride) { +template<> EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const half2& from, Index stride) { to[stride*0] = __low2half(from); to[stride*1] = __high2half(from); } -template<> EIGEN_DEVICE_FUNC inline half pfirst(const half2& a) { +template<> EIGEN_DEVICE_FUNC inline Eigen::half pfirst(const half2& a) { return __low2half(a); } @@ -110,15 +112,15 @@ template<> EIGEN_DEVICE_FUNC inline half2 pabs(const half2& a) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - half a1 = __low2half(kernel.packet[0]); - half a2 = __high2half(kernel.packet[0]); - half b1 = __low2half(kernel.packet[1]); - half b2 = __high2half(kernel.packet[1]); + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); kernel.packet[0] = __halves2half2(a1, b1); kernel.packet[1] = __halves2half2(a2, b2); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const half& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { #if __CUDA_ARCH__ >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else @@ -227,17 +229,17 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& return __halves2half2(r1, r2); } -template<> EIGEN_DEVICE_FUNC inline half predux(const half2& a) { +template<> EIGEN_DEVICE_FUNC inline Eigen::half predux(const half2& a) { #if __CUDA_ARCH__ >= 530 return __hadd(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); float a2 = __high2float(a); - return half(internal::raw_uint16_to_half(__float2half_rn(a1 + a2))); + return Eigen::half(internal::raw_uint16_to_half(__float2half_rn(a1 + a2))); #endif } -template<> EIGEN_DEVICE_FUNC inline half predux_max(const half2& a) { +template<> EIGEN_DEVICE_FUNC inline Eigen::half predux_max(const half2& a) { #if __CUDA_ARCH__ >= 530 __half first = __low2half(a); __half second = __high2half(a); @@ -249,7 +251,7 @@ template<> EIGEN_DEVICE_FUNC inline half predux_max(const half2& a) { #endif } -template<> EIGEN_DEVICE_FUNC inline half predux_min(const half2& a) { +template<> EIGEN_DEVICE_FUNC inline Eigen::half predux_min(const half2& a) { #if __CUDA_ARCH__ >= 530 __half first = __low2half(a); __half second = __high2half(a); @@ -261,13 +263,13 @@ template<> EIGEN_DEVICE_FUNC inline half predux_min(const half2& a) { #endif } -template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) { +template<> EIGEN_DEVICE_FUNC inline Eigen::half predux_mul(const half2& a) { #if __CUDA_ARCH__ >= 530 return __hmul(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); float a2 = __high2float(a); - return half(internal::raw_uint16_to_half(__float2half_rn(a1 * a2))); + return Eigen::half(internal::raw_uint16_to_half(__float2half_rn(a1 * a2))); #endif } @@ -303,20 +305,7 @@ template<> EIGEN_DEVICE_FUNC inline half2 prsqrt(const half2& a) { return __floats2half2_rn(r1, r2); } -} // end namespace internal - -} // end namespace Eigen - -#endif -#endif - -#else // EIGEN_HAS_CUDA_FP16 - - -namespace Eigen { -namespace internal { - -#if defined EIGEN_VECTORIZE_AVX +#elif defined EIGEN_VECTORIZE_AVX typedef struct { __m128i x; @@ -326,7 +315,7 @@ typedef struct { template<> struct is_arithmetic { enum { value = true }; }; template <> -struct packet_traits : default_packet_traits { +struct packet_traits : default_packet_traits { typedef Packet8h type; // There is no half-size packet for Packet8h. typedef Packet8h half; @@ -357,38 +346,38 @@ struct packet_traits : default_packet_traits { template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h pset1(const half& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { Packet8h result; result.x = _mm_set1_epi16(from.x); return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half pfirst(const Packet8h& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { return raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h pload(const half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { Packet8h result; result.x = _mm_load_si128(reinterpret_cast(from)); return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h ploadu(const half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { Packet8h result; result.x = _mm_loadu_si128(reinterpret_cast(from)); return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(half* to, const Packet8h& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { _mm_store_si128((__m128i*)to, from.x); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, const Packet8h& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { _mm_storeu_si128((__m128i*)to, from.x); } template<> EIGEN_DEVICE_FUNC inline Packet8h -ploadquad(const half* from) { +ploadquad(const Eigen::half* from) { Packet8h result; unsigned short a = from[0].x; unsigned short b = from[1].x; @@ -400,7 +389,7 @@ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { #ifdef EIGEN_HAS_FP16_C return _mm256_cvtph_ps(a.x); #else - EIGEN_ALIGN32 half aux[8]; + EIGEN_ALIGN32 Eigen::half aux[8]; pstore(aux, a); float f0(aux[0]); float f1(aux[1]); @@ -423,14 +412,14 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #else EIGEN_ALIGN32 float aux[8]; pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + Eigen::half h4(aux[4]); + Eigen::half h5(aux[5]); + Eigen::half h6(aux[6]); + Eigen::half h7(aux[7]); Packet8h result; result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); @@ -438,6 +427,8 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #endif } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -452,16 +443,16 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8h pmul(const P return float2half(rf); } -template<> EIGEN_DEVICE_FUNC inline Packet8h pgather(const half* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet8h pgather(const Eigen::half* from, Index stride) { Packet8h result; result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); return result; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(half* to, const Packet8h& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const Packet8h& from, Index stride) { - EIGEN_ALIGN32 half aux[8]; + EIGEN_ALIGN32 Eigen::half aux[8]; pstore(aux, from); to[stride*0].x = aux[0].x; to[stride*1].x = aux[1].x; @@ -523,13 +514,13 @@ ptranspose(PacketBlock& kernel) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN32 half in[4][8]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); - EIGEN_ALIGN32 half out[4][8]; + EIGEN_ALIGN32 Eigen::half out[4][8]; for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) { @@ -557,7 +548,7 @@ typedef struct { template<> struct is_arithmetic { enum { value = true }; }; template <> -struct packet_traits : default_packet_traits { +struct packet_traits : default_packet_traits { typedef Packet4h type; // There is no half-size packet for Packet8h. typedef Packet4h half; @@ -588,24 +579,26 @@ struct packet_traits : default_packet_traits { template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h pset1(const half& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { Packet4h result; result.x = _mm_set1_pi16(from.x); return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half pfirst(const Packet4h& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { return raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { __int64_t a64 = _mm_cvtm64_si64(a.x); __int64_t b64 = _mm_cvtm64_si64(b.x); - half h[4]; + Eigen::half h[4]; - half ha = raw_uint16_to_half(static_cast(a64)); - half hb = raw_uint16_to_half(static_cast(b64)); + Eigen::half ha = raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = raw_uint16_to_half(static_cast(b64)); h[0] = ha + hb; ha = raw_uint16_to_half(static_cast(a64 >> 16)); hb = raw_uint16_to_half(static_cast(b64 >> 16)); @@ -625,10 +618,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h pmul(const P __int64_t a64 = _mm_cvtm64_si64(a.x); __int64_t b64 = _mm_cvtm64_si64(b.x); - half h[4]; + Eigen::half h[4]; - half ha = raw_uint16_to_half(static_cast(a64)); - half hb = raw_uint16_to_half(static_cast(b64)); + Eigen::half ha = raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = raw_uint16_to_half(static_cast(b64)); h[0] = ha * hb; ha = raw_uint16_to_half(static_cast(a64 >> 16)); hb = raw_uint16_to_half(static_cast(b64 >> 16)); @@ -644,36 +637,41 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h pmul(const P return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h pload(const half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { Packet4h result; result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h ploadu(const half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { Packet4h result; result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); return result; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(half* to, const Packet4h& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { __int64_t r = _mm_cvtm64_si64(from.x); *(reinterpret_cast<__int64_t*>(to)) = r; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, const Packet4h& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { __int64_t r = _mm_cvtm64_si64(from.x); *(reinterpret_cast<__int64_t*>(to)) = r; } -template<> EIGEN_DEVICE_FUNC inline Packet4h pgather(const half* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4h pgather(const Eigen::half* from, Index stride) { Packet4h result; result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); return result; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(half* to, const Packet4h& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const Packet4h& from, Index stride) { __int64_t a = _mm_cvtm64_si64(from.x); to[stride*0].x = static_cast(a); @@ -696,9 +694,8 @@ ptranspose(PacketBlock& kernel) { } #endif -} -} -#endif // EIGEN_HAS_CUDA_FP16 +} +} #endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index 45c75cf52..150c7e94a 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -14,49 +14,47 @@ namespace Eigen { namespace internal { -#if defined(EIGEN_HAS_CUDA_FP16) - template<> -struct scalar_cast_op { +struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const { + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(a); #else - return half(a); + return Eigen::half(a); #endif } }; template<> -struct functor_traits > +struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; template<> -struct scalar_cast_op { +struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const { + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __float2half(static_cast(a)); #else - return half(static_cast(a)); + return Eigen::half(static_cast(a)); #endif } }; template<> -struct functor_traits > +struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; template<> -struct scalar_cast_op { +struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 return __half2float(a); #else @@ -66,7 +64,7 @@ struct scalar_cast_op { }; template<> -struct functor_traits > +struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; @@ -74,7 +72,7 @@ struct functor_traits > #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 template <> -struct type_casting_traits { +struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 2, @@ -89,7 +87,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(con } template <> -struct type_casting_traits { +struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, @@ -99,16 +97,13 @@ struct type_casting_traits { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { // Simply discard the second half of the input - return __float22half2_rn(make_float2(a.x, a.y)); + return __floats2half2_rn(a.x, a.y); } -#endif +#elif defined EIGEN_VECTORIZE_AVX -#else - -#ifdef EIGEN_VECTORIZE_AVX template <> -struct type_casting_traits { +struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, @@ -121,7 +116,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h } template <> -struct type_casting_traits { +struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, @@ -134,8 +129,9 @@ template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f } #elif defined EIGEN_VECTORIZE_SSE && !EIGEN_COMP_MSVC + template <> -struct type_casting_traits { +struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, @@ -145,7 +141,7 @@ struct type_casting_traits { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { __int64_t a64 = _mm_cvtm64_si64(a.x); - half h = raw_uint16_to_half(static_cast(a64)); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); float f1 = static_cast(h); h = raw_uint16_to_half(static_cast(a64 >> 16)); float f2 = static_cast(h); @@ -157,7 +153,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pcast -struct type_casting_traits { +struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, @@ -168,17 +164,16 @@ struct type_casting_traits { template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { EIGEN_ALIGN16 float aux[4]; pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); Packet4h result; result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); return result; } -#endif #endif } // end namespace internal