diff --git a/Eigen/Core b/Eigen/Core
index 834ff9415..7107f83d0 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -333,6 +333,7 @@ using std::ptrdiff_t;
 
 #if defined EIGEN_VECTORIZE_CUDA
   #include "src/Core/arch/CUDA/PacketMath.h"
+  #include "src/Core/arch/CUDA/PacketMathHalf.h"
  #include "src/Core/arch/CUDA/MathFunctions.h"
   #include "src/Core/arch/CUDA/TypeCasting.h"
 #endif
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
new file mode 100644
index 000000000..7f99376fb
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -0,0 +1,220 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
+#define EIGEN_PACKET_MATH_HALF_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+
+__device__ half operator + (const half& a, const half& b) {
+  return __hadd(a, b);
+}
+__device__ half operator * (const half& a, const half& b) {
+  return __hmul(a, b);
+}
+__device__ half operator - (const half& a, const half& b) {
+  return __hsub(a, b);
+}
+__device__ half operator / (const half& a, const half& b) {
+  assert(false && "tbd");
+  return half();
+}
+__device__ half operator - (const half& a) {
+  return __hneg(a);
+}
+
+
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<half> : default_packet_traits
+{
+  typedef half2 type;
+  typedef half2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+
+    HasBlend = 0,
+  };
+};
+
+
+template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
+  return __half2half2(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
+  return __halves2half2(a, __hadd(a, __float2half(1)));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+  return __hadd2(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+  return __hsub2(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+  return __hneg2(a);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+  return __hmul2(a, b);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
+  return __hfma2(a, b, c);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+}
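+
+// NOTE: the sm_53 fp16 intrinsic set used here provides packed add/sub/mul/fma
+// but, as far as I can tell, no packed divide or min/max. pdiv above and
+// pmin/pmax below therefore unpack to float with __low2float/__high2float,
+// compute in fp32, and repack with __floats2half2_rn/__halves2half2.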
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
+  return __halves2half2(from[0], from[1]);
+}
+
+template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) {
+  return __halves2half2(from[0], from[0]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
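+
+// The two ploadt_ro overloads below go through __ldg, i.e. the read-only
+// (texture) data cache path. The Unaligned variant fetches the two halfs
+// separately, presumably because a vectorized half2 __ldg requires the
+// packet's natural 4-byte alignment.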
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
+  return __ldg((const half2*)from);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
+  return __halves2half2(__ldg(from+0), __ldg(from+1));
+}
+
+template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
+  return __halves2half2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
+  to[stride*0] = __low2half(from);
+  to[stride*1] = __high2half(from);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
+  return __low2half(a);
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
+  return __hadd(__low2half(a), __high2half(a));
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
+  half first = __low2half(a);
+  half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
+  half first = __low2half(a);
+  half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+}
+
+template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
+  return __hmul(__low2half(a), __high2half(a));
+}
+
+template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
+  assert(false && "tbd");
+  return half2();
+}
+
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<half2,2>& kernel) {
+  assert(false && "tbd");
+  // half tmp = kernel.packet[0].y;
+  // kernel.packet[0].y = kernel.packet[1].x;
+  // kernel.packet[1].x = tmp;
+}
+
+#endif
+#endif
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index aee222a14..26c18a718 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -20,6 +20,7 @@ using Eigen::Tensor;
 
 #ifdef EIGEN_HAS_CUDA_FP16
+
 void test_cuda_conversion() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
@@ -53,11 +54,53 @@ void test_cuda_conversion() {
   gpu_device.deallocate(d_half);
   gpu_device.deallocate(d_conv);
 }
+
+void test_cuda_elementwise() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
+      d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
+      d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+  gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+  gpu_res_half.device(gpu_device) = ((gpu_float1.cast<half>() + gpu_float2.cast<half>()) * gpu_float1.cast<half>()).cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
 #endif
 
+
 void test_cxx11_tensor_of_float16_cuda()
 {
 #ifdef EIGEN_HAS_CUDA_FP16
   CALL_SUBTEST_1(test_cuda_conversion());
+  CALL_SUBTEST_1(test_cuda_elementwise());
 #endif
 }
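
For reference, a minimal sketch of how the device-side primitives above compose inside a kernel, assuming this patch is applied and the translation unit is built by nvcc for sm_53 or newer with EIGEN_USE_GPU and EIGEN_HAS_CUDA_FP16 defined. The kernel name half2_squared_add and the x*x + y expression are hypothetical illustrations, not part of the patch; the pload/pstore path additionally assumes 4-byte-aligned pointers, and an odd tail element would need scalar handling.

#include <cuda_fp16.h>
#include <Eigen/Core>

// Hypothetical example (not part of the patch): out[i] = x[i]*x[i] + y[i],
// processing two half values per thread via the half2 packet primitives.
__global__ void half2_squared_add(const half* x, const half* y,
                                  half* out, int n) {
  const int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
  if (i + 1 < n) {
    const half2 a = Eigen::internal::pload<half2>(x + i);  // aligned packet load
    const half2 b = Eigen::internal::pload<half2>(y + i);
    // pmadd maps to __hfma2: a*a + b as a single packed fp16 fused multiply-add.
    Eigen::internal::pstore(out + i, Eigen::internal::pmadd(a, a, b));
  }
}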