From a9cc6a06b9fe2ea50b0437f73b101116a39da727 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 9 Feb 2016 05:10:06 +0000 Subject: [PATCH 01/18] Fixed compilation warning in the splines test --- unsupported/test/splines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp index 97665af96..3be020434 100644 --- a/unsupported/test/splines.cpp +++ b/unsupported/test/splines.cpp @@ -239,7 +239,7 @@ void check_global_interpolation_with_derivatives2d() typedef Spline2d::PointType PointType; typedef Spline2d::KnotVectorType KnotVectorType; - const unsigned int numPoints = 100; + const Eigen::DenseIndex numPoints = 100; const unsigned int dimension = 2; const unsigned int degree = 3; From 06a2bc7c9c6af150f54605c74a95379a7c12ca28 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 17 Feb 2016 14:41:59 -0800 Subject: [PATCH 02/18] Tiny bugfix in SpecialFunctions: some compilers don't like doubles implicitly downcast to floats in an array constructor. --- Eigen/src/Core/SpecialFunctions.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h index 6c6b21f98..6b4598e3e 100644 --- a/Eigen/src/Core/SpecialFunctions.h +++ b/Eigen/src/Core/SpecialFunctions.h @@ -182,10 +182,10 @@ struct digamma_impl_maybe_poly { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float s) { const float A[] = { - -4.16666666666666666667E-3, - 3.96825396825396825397E-3, - -8.33333333333333333333E-3, - 8.33333333333333333333E-2 + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f }; float z; From 8ce46f9d8959236c0dfb6dd7dca7423d825f0c59 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 18 Feb 2016 13:24:34 -0800 Subject: [PATCH 03/18] Improved implementation of ptanh for SSE and AVX --- Eigen/src/Core/arch/AVX/MathFunctions.h | 42 +++++++++++-------------- Eigen/src/Core/arch/SSE/MathFunctions.h | 33 +++++++++---------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index a24bf6e26..98d8e029f 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -267,31 +267,34 @@ pexp(const Packet8f& _x) { // Hyperbolic Tangent function. // Doesn't do anything fancy, just a 13/6-degree rational interpolant which -// is accurate up to a couple of ulp in the range [-8, 8], outside of which the +// is accurate up to a couple of ulp in the range [-9, 9], outside of which the // fl(tanh(x)) = +/-1. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f ptanh(const Packet8f& _x) { - // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later. - const Packet8f x = _mm256_mul_ps(_x, _mm256_set1_ps(0.125f)); + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is +/-1.0f in single-precision. + _EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f); + _EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f); + const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x)); // The monomial coefficients of the numerator polynomial (odd). - _EIGEN_DECLARE_CONST_Packet8f(alpha_1, -2.47030171958948e-03f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_3, -2.06804010015822e-02f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_5, -3.13693994587418e-02f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_7, -7.19851201683627e-03f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_9, 8.31561269687160e-04f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_11, -1.37626659546502e-04f); - _EIGEN_DECLARE_CONST_Packet8f(alpha_13, 1.39116714700458e-05f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f); + _EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f); // The monomial coefficients of the denominator polynomial (even). - _EIGEN_DECLARE_CONST_Packet8f(beta_0, -3.08787724141615e-04f); - _EIGEN_DECLARE_CONST_Packet8f(beta_2, -9.17251911622436e-03f); - _EIGEN_DECLARE_CONST_Packet8f(beta_4, -3.09625062090444e-02f); - _EIGEN_DECLARE_CONST_Packet8f(beta_6, -2.05669680763032e-02f); + _EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f); + _EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f); + _EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f); + _EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f); // Since the polynomials are odd/even, we need x^2. - const Packet8f x2 = _mm256_mul_ps(x, x); + const Packet8f x2 = pmul(x, x); // Evaluate the numerator polynomial p. Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11); @@ -308,14 +311,7 @@ ptanh(const Packet8f& _x) { q = pmadd(x2, q, p8f_beta_0); // Divide the numerator by the denominator. - const Packet8f res = pdiv(p, q); - - // Mask-out values outside of [-8, 8]. - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(minus_one, -1.0f); - return _mm256_blendv_ps( - _mm256_blendv_ps(res, p8f_one, _mm256_cmp_ps(x, p8f_one, _CMP_GT_OQ)), - p8f_minus_one, _mm256_cmp_ps(x, p8f_minus_one, _CMP_LT_OQ)); + return pdiv(p, q); } template <> diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index a7a0d906f..28f103eeb 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -518,30 +518,31 @@ Packet2d prsqrt(const Packet2d& x) { // Hyperbolic Tangent function. // Doesn't do anything fancy, just a 13/6-degree rational interpolant which -// is accurate up to a couple of ulp in the range [-8, 8], outside of which the +// is accurate up to a couple of ulp in the range [-9, 9], outside of which the // fl(tanh(x)) = +/-1. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh(const Packet4f& _x) { - // Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later. - const Packet4f x = - pmax(pset1(-1.0f), - pmin(pset1(1.0f), pmul(_x, pset1(0.125f)))); + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is +/-1.0f in single-precision. + _EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f); + _EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f); + const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x)); // The monomial coefficients of the numerator polynomial (odd). - _EIGEN_DECLARE_CONST_Packet4f(alpha_1, -2.47030171958948e-03f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_3, -2.06804010015822e-02f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_5, -3.13693994587418e-02f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_7, -7.19851201683627e-03f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_9, 8.31561269687160e-04f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_11, -1.37626659546502e-04f); - _EIGEN_DECLARE_CONST_Packet4f(alpha_13, 1.39116714700458e-05f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); + _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); // The monomial coefficients of the denominator polynomial (even). - _EIGEN_DECLARE_CONST_Packet4f(beta_0, -3.08787724141615e-04f); - _EIGEN_DECLARE_CONST_Packet4f(beta_2, -9.17251911622436e-03f); - _EIGEN_DECLARE_CONST_Packet4f(beta_4, -3.09625062090444e-02f); - _EIGEN_DECLARE_CONST_Packet4f(beta_6, -2.05669680763032e-02f); + _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f); + _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f); + _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f); + _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f); // Since the polynomials are odd/even, we need x^2. const Packet4f x2 = pmul(x, x); From 17b9fbed34cefe08b4f63dbe0734e12311eb8669 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 06:16:07 +0000 Subject: [PATCH 04/18] Added preliminary support for half floats on CUDA GPU. For now we can simply convert floats into half floats and vice versa --- Eigen/Core | 3 + Eigen/src/Core/arch/CUDA/PacketMath.h | 1 - Eigen/src/Core/arch/CUDA/TypeCasting.h | 100 +++++++++++++++++++++++++ unsupported/test/CMakeLists.txt | 8 +- 4 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 Eigen/src/Core/arch/CUDA/TypeCasting.h diff --git a/Eigen/Core b/Eigen/Core index 63602f4c3..17f864084 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -200,6 +200,7 @@ #if defined __CUDACC__ #define EIGEN_VECTORIZE_CUDA #include + #include #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) @@ -329,7 +330,9 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/PacketMathHalf.h" #include "src/Core/arch/CUDA/MathFunctions.h" + #include "src/Core/arch/CUDA/TypeCasting.h" #endif #include "src/Core/arch/Default/Settings.h" diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index d3d9f910e..d5dcc7fa3 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -21,7 +21,6 @@ namespace internal { template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; - template<> struct packet_traits : default_packet_traits { typedef float4 type; diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h new file mode 100644 index 000000000..a8c06ff48 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -0,0 +1,100 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_CUDA_H +#define EIGEN_TYPE_CASTING_CUDA_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __float2half(a); + #else + assert(false && "tbd"); + return half(); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __half2float(a); + #else + assert(false && "tbd"); + return 0.0f; + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +#else + assert(false && "tbd"); + return float4(); +#endif +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { + // Simply discard the second half of the input +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __float22half2_rn(make_float2(a.x, a.y)); +#else + assert(false && "tbd"); + return half2(); +#endif +} + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_CUDA_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index c202cf0e4..678a0d1d7 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -37,9 +37,9 @@ if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$") ei_add_test(BVH) endif() -ei_add_test(matrix_exponential) +#ei_add_test(matrix_exponential) ei_add_test(matrix_function) -ei_add_test(matrix_power) +#ei_add_test(matrix_power) ei_add_test(matrix_square_root) ei_add_test(alignedvector3) @@ -173,5 +173,9 @@ if(CUDA_FOUND) ei_add_test(cxx11_tensor_random_cuda) ei_add_test(cxx11_tensor_argmax_cuda) + set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_53 -Xcudafe \"--display_error_number\"") + ei_add_test(cxx11_tensor_of_float16_cuda) + + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() From 7151bd876845c15cb6b8abc0886d7917ece635ed Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 06:20:50 +0000 Subject: [PATCH 05/18] Reverted unintended changes introduced by a bad merge --- Eigen/Core | 1 - unsupported/test/CMakeLists.txt | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 17f864084..3edbe6585 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -330,7 +330,6 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" - #include "src/Core/arch/CUDA/PacketMathHalf.h" #include "src/Core/arch/CUDA/MathFunctions.h" #include "src/Core/arch/CUDA/TypeCasting.h" #endif diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 678a0d1d7..2c686177b 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -37,9 +37,9 @@ if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$") ei_add_test(BVH) endif() -#ei_add_test(matrix_exponential) +ei_add_test(matrix_exponential) ei_add_test(matrix_function) -#ei_add_test(matrix_power) +ei_add_test(matrix_power) ei_add_test(matrix_square_root) ei_add_test(alignedvector3) From f36c0c2c65a78959f6ccbbc29c6e80f86b062bc8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 06:23:28 +0000 Subject: [PATCH 06/18] Added regression test for float16 --- .../test/cxx11_tensor_of_float16_cuda.cu | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 unsupported/test/cxx11_tensor_of_float16_cuda.cu diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu new file mode 100644 index 000000000..e9f5dd968 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -0,0 +1,60 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include + +using Eigen::Tensor; + +void test_cuda_conversion() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + half* d_half = (half*)gpu_device.allocate(num_elem * sizeof(half)); + float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_half( + d_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_conv( + d_conv, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random(); + gpu_half.device(gpu_device) = gpu_float.cast(); + gpu_conv.device(gpu_device) = gpu_half.cast(); + + Tensor initial(num_elem); + Tensor final(num_elem); + gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float)); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(initial(i), final(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_half); + gpu_device.deallocate(d_conv); +} + + +void test_cxx11_tensor_of_float16_cuda() +{ + CALL_SUBTEST_1(test_cuda_conversion()); +} From 0606a0a39bcf01b0a03f0dcd17f7075fce8c402c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 18 Feb 2016 23:15:23 -0800 Subject: [PATCH 07/18] FP16 on CUDA are only available starting with cuda 7.5. Disable them when using an older version of CUDA --- Eigen/Core | 5 ++++- Eigen/src/Core/arch/CUDA/TypeCasting.h | 3 +++ unsupported/test/cxx11_tensor_of_float16_cuda.cu | 5 ++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 3edbe6585..834ff9415 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -200,7 +200,10 @@ #if defined __CUDACC__ #define EIGEN_VECTORIZE_CUDA #include - #include + #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 + #define EIGEN_HAS_CUDA_FP16 + #include + #endif #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index a8c06ff48..279fd4fd0 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -14,6 +14,8 @@ namespace Eigen { namespace internal { +#if defined(EIGEN_HAS_CUDA_FP16) + template<> struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) @@ -92,6 +94,7 @@ template<> EIGEN_STRONG_INLINE half2 pcast(const float4& a) { #endif } +#endif } // end namespace internal diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index e9f5dd968..aee222a14 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -19,6 +19,7 @@ using Eigen::Tensor; +#ifdef EIGEN_HAS_CUDA_FP16 void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -52,9 +53,11 @@ void test_cuda_conversion() { gpu_device.deallocate(d_half); gpu_device.deallocate(d_conv); } - +#endif void test_cxx11_tensor_of_float16_cuda() { +#ifdef EIGEN_HAS_CUDA_FP16 CALL_SUBTEST_1(test_cuda_conversion()); +#endif } From ac5d706a942faa275ea467009d2004a7aeb3e3fa Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 08:19:12 +0000 Subject: [PATCH 08/18] Added support for simple coefficient wise tensor expression using half floats on CUDA devices --- Eigen/Core | 1 + Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 220 ++++++++++++++++++ .../test/cxx11_tensor_of_float16_cuda.cu | 43 ++++ 3 files changed, 264 insertions(+) create mode 100644 Eigen/src/Core/arch/CUDA/PacketMathHalf.h diff --git a/Eigen/Core b/Eigen/Core index 834ff9415..7107f83d0 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -333,6 +333,7 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/PacketMathHalf.h" #include "src/Core/arch/CUDA/MathFunctions.h" #include "src/Core/arch/CUDA/TypeCasting.h" #endif diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h new file mode 100644 index 000000000..7f99376fb --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -0,0 +1,220 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H +#define EIGEN_PACKET_MATH_HALF_CUDA_H + +namespace Eigen { + +namespace internal { + +#if defined(EIGEN_HAS_CUDA_FP16) + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +__device__ half operator + (const half& a, const half& b) { + return __hadd(a, b); +} +__device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +__device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +__device__ half operator / (const half& a, const half& b) { + assert(false && "tbd"); + return half(); +} +__device__ half operator - (const half& a) { + return __hneg(a); +} + + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasErf = 1, + HasErfc = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const half& from) { + return __half2half2(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const half& a) { + return __halves2half2(a, __hadd(a, __float2half(1))); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { + return __hadd2(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { + return __hsub2(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { + return __hneg2(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { + return __hmul2(a, b); +} + + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { + return __hfma2(a, b, c); + } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const half* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE half2 ploaddup(const half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(half* to, const half2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { + return __ldg((const half2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const half* from) { + return __halves2half2(__ldg(from+0), __ldg(from+1)); +} + +template<> EIGEN_DEVICE_FUNC inline half2 pgather(const half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> EIGEN_DEVICE_FUNC inline half pfirst(const half2& a) { + return __low2half(a); +} + +template<> EIGEN_DEVICE_FUNC inline half predux(const half2& a) { + return __hadd(__low2half(a), __high2half(a)); +} + +template<> EIGEN_DEVICE_FUNC inline half predux_max(const half2& a) { + half first = __low2half(a); + half second = __high2half(a); + return __hgt(first, second) ? first : second; +} + +template<> EIGEN_DEVICE_FUNC inline half predux_min(const half2& a) { + half first = __low2half(a); + half second = __high2half(a); + return __hlt(first, second) ? first : second; +} + +template<> EIGEN_DEVICE_FUNC inline half predux_mul(const half2& a) { + return __hmul(__low2half(a), __high2half(a)); +} + +template<> EIGEN_DEVICE_FUNC inline half2 pabs(const half2& a) { + assert(false && "tbd"); + return half2(); +} + + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + assert(false && "tbd"); + // half tmp = kernel.packet[0].y; + // kernel.packet[0].y = kernel.packet[1].x; + // kernel.packet[1].x = tmp; +} + +#endif +#endif +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index aee222a14..26c18a718 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -20,6 +20,7 @@ using Eigen::Tensor; #ifdef EIGEN_HAS_CUDA_FP16 + void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -53,11 +54,53 @@ void test_cuda_conversion() { gpu_device.deallocate(d_half); gpu_device.deallocate(d_conv); } + +void test_cuda_elementwise() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float1( + d_float1, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_float2( + d_float2, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; + gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).cast(); + + Tensor half_prec(num_elem); + Tensor full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + + for (int i = 0; i < num_elem; ++i) { + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + #endif + void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_element_wise()); #endif } From cd042dbbfdd4680c983d89c4f526c49d4657c05d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:03:26 +0000 Subject: [PATCH 09/18] Fixed a bug in the tensor type converter --- unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index d2defcaf4..e254c0b7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -124,9 +124,12 @@ struct PacketConverter { return internal::pcast(m_impl.template packet(index)); } else { const int TgtPacketSize = internal::unpacket_traits::size; + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TgtType; + internal::scalar_cast_op converter; EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; for (int i = 0; i < TgtPacketSize; ++i) { - values[i] = m_impl.coeff(index+i); + values[i] = converter(m_impl.coeff(index+i)); } TgtPacket rslt = internal::pload(values); return rslt; From dc26459b9910d8c1fda964917635ee8277dd2614 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:16:54 +0000 Subject: [PATCH 10/18] Implemented protate() for CUDA --- Eigen/src/Core/arch/CUDA/PacketMath.h | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index d5dcc7fa3..a32b41e18 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -272,6 +272,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) return a.x * a.y; } +template +struct protate_impl +{ + static float4 run(const float4& a) { + if (offset == 0) { + return make_float4(a.x, a.y, a.z, a.w); + } + if (offset == 1) { + return make_float4(a.w, a.x, a.y, a.z); + } + if (offset == 2) { + return make_float4(a.z, a.w, a.x, a.y); + } + return make_float4(a.y, a.z, a.w, a.x); + } +}; + +template +struct protate_impl +{ + static double2 run(const double2& a) { + if (offset == 0) { + return make_double2(a.x, a.y); + } + return make_double2(a.y, a.x); + } +}; + + template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); } From f7cb755299b8cdee3b8eaffd2941af9ee6d08b04 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:57:26 +0000 Subject: [PATCH 11/18] Added support for operators +=, -=, *= and /= on CUDA half floats --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 7f99376fb..c99c1acf7 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -39,6 +39,22 @@ __device__ half operator / (const half& a, const half& b) { __device__ half operator - (const half& a) { return __hneg(a); } +__device__ half operator += (half& a, const half& b) { + a = __hadd(a, b); + return a; +} +__device__ half operator *= (half& a, const half& b) { + a = __hmul(a, b); + return a; +} +__device__ half operator -= (half& a, const half& b) { + a = __hsub(a, b); + return a; +} +__device__ half operator /= (half& a, const half& b) { + assert(false && "tbd"); + return a; +} template<> struct is_arithmetic { enum { value = true }; }; From f3352e0fb02d1048a4c21c969b10e84185f4e5bf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:58:57 +0000 Subject: [PATCH 12/18] Don't make the array constructors explicit --- unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h index 56e2b8afc..eae8b996c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -42,7 +42,7 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() { } - explicit EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v) { EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v; From a08d2ff0c911f7f27dd8cb0ca14fa5b9419b3488 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 15:59:59 +0000 Subject: [PATCH 13/18] Started to work on contractions and reductions using half floats --- .../test/cxx11_tensor_of_float16_cuda.cu | 95 ++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 26c18a718..d3cd94cd6 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -94,6 +94,97 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_float); } +/* +void test_cuda_contractions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int rows = 101; + int cols = 101; + int num_elem = rows*cols; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float1( + d_float1, rows, cols); + Eigen::TensorMap, Eigen::Aligned> gpu_float2( + d_float2, rows, cols); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half, rows, cols); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float, rows, cols); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims(DimPair(1, 0)); + gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims); + gpu_res_half.device(gpu_device) = gpu_float1.cast().contract(gpu_float2.cast(), dims).cast(); + + Tensor half_prec(rows, cols); + Tensor full_prec(rows, cols); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j)); + } + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + +void test_cuda_reductions() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int size = 101; + int num_elem = size*size; + + float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float)); + float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float)); + + Eigen::TensorMap, Eigen::Aligned> gpu_float1( + d_float1, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_float2( + d_float2, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_res_half( + d_res_half, size); + Eigen::TensorMap, Eigen::Aligned> gpu_res_float( + d_res_float, size); + + gpu_float1.device(gpu_device) = gpu_float1.random(); + gpu_float2.device(gpu_device) = gpu_float2.random(); + + Eigen::array redux_dim = {{0}}; + gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim); + gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim).cast(); + + Tensor half_prec(size); + Tensor full_prec(size); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); + + for (int i = 0; i < size; ++i) { + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float1); + gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} +*/ + #endif @@ -101,6 +192,8 @@ void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 CALL_SUBTEST_1(test_cuda_conversion()); - CALL_SUBTEST_1(test_cuda_element_wise()); + CALL_SUBTEST_1(test_cuda_elementwise()); +// CALL_SUBTEST_2(test_cuda_contractions()); +// CALL_SUBTEST_3(test_cuda_reductions()); #endif } From f268db1c4bd0510f13f0218205c2e135f2790175 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 16:31:04 +0000 Subject: [PATCH 14/18] Added the ability to query the minor version of a cuda device --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index e684ab8f7..3808eb155 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -247,6 +247,14 @@ struct GpuDevice { return 0; #endif } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + return stream_->deviceProperties().minor; +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); + return 0; +#endif + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; From 5c4901b83a3ec15988521e195abc05e804c541dc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 10:03:19 -0800 Subject: [PATCH 15/18] Implemented the scalar division of 2 half floats --- Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index c99c1acf7..d0106f4f1 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -33,8 +33,9 @@ __device__ half operator - (const half& a, const half& b) { return __hsub(a, b); } __device__ half operator / (const half& a, const half& b) { - assert(false && "tbd"); - return half(); + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); } __device__ half operator - (const half& a) { return __hneg(a); @@ -52,7 +53,7 @@ __device__ half operator -= (half& a, const half& b) { return a; } __device__ half operator /= (half& a, const half& b) { - assert(false && "tbd"); + a = a / b; return a; } From 180156ba1aefceae0bd93f056e5807a83ccbb1b5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 10:05:59 -0800 Subject: [PATCH 16/18] Added support for tensor reductions on half floats --- Eigen/src/Core/arch/CUDA/TypeCasting.h | 20 ++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 15 +++++++----- .../test/cxx11_tensor_of_float16_cuda.cu | 23 +++++++++++++------ 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/CUDA/TypeCasting.h index 279fd4fd0..2742a4e7b 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -34,6 +34,26 @@ template<> struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __float2half(static_cast(a)); + #else + assert(false && "tbd"); + return half(); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + template<> struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index f94ffa020..e2d876140 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -72,11 +72,12 @@ template struct SumReducer } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return static_cast(0); + internal::scalar_cast_op conv; + return conv(0); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(0); + return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; @@ -110,11 +111,12 @@ template struct MeanReducer } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return static_cast(0); + internal::scalar_cast_op conv; + return conv(0); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(0); + return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum / scalarCount_; @@ -214,11 +216,12 @@ template struct ProdReducer } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return static_cast(1); + internal::scalar_cast_op conv; + return conv(1); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(1); + return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index d3cd94cd6..5ce96a1c2 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -93,7 +93,6 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } - /* void test_cuda_contractions() { Eigen::CudaStreamDevice stream; @@ -139,7 +138,7 @@ void test_cuda_contractions() { gpu_device.deallocate(d_float2); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); -} +}*/ void test_cuda_reductions() { @@ -183,7 +182,7 @@ void test_cuda_reductions() { gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } -*/ + #endif @@ -191,9 +190,19 @@ void test_cuda_reductions() { void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 - CALL_SUBTEST_1(test_cuda_conversion()); - CALL_SUBTEST_1(test_cuda_elementwise()); -// CALL_SUBTEST_2(test_cuda_contractions()); -// CALL_SUBTEST_3(test_cuda_reductions()); + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice device(&stream); + if (device.majorDeviceVersion() > 5 || + (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) { + CALL_SUBTEST_1(test_cuda_conversion()); + CALL_SUBTEST_1(test_cuda_elementwise()); +// CALL_SUBTEST_2(test_cuda_contractions()); + CALL_SUBTEST_3(test_cuda_reductions()); + } + else { + std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl; + } +#else + std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; #endif } From 670db7988d6903fbb51c449383c4d5162d83caaf Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 13:03:26 -0800 Subject: [PATCH 17/18] Updated the contraction code to make it compatible with half floats. --- .../CXX11/src/Tensor/TensorContractionCuda.h | 100 ++++++++++-------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index a5f3debc4..f5b539c7e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -99,23 +99,23 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, #define prefetchIntoRegisters(base_k) \ { \ - lhs_pf0 = Scalar(0); \ - lhs_pf1 = Scalar(0); \ - lhs_pf2 = Scalar(0); \ - lhs_pf3 = Scalar(0); \ - lhs_pf4 = Scalar(0); \ - lhs_pf5 = Scalar(0); \ - lhs_pf6 = Scalar(0); \ - lhs_pf7 = Scalar(0); \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ \ - rhs_pf0 = Scalar(0); \ - rhs_pf1 = Scalar(0); \ - rhs_pf2 = Scalar(0); \ - rhs_pf3 = Scalar(0); \ - rhs_pf4 = Scalar(0); \ - rhs_pf5 = Scalar(0); \ - rhs_pf6 = Scalar(0); \ - rhs_pf7 = Scalar(0); \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ \ if (!needs_edge_check || lhs_vert < m_size) { \ const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ @@ -261,15 +261,16 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, // declare and initialize result array #define res(i, j) _res_##i##j #define initResultRow(i) \ - Scalar res(i, 0) = Scalar(0); \ - Scalar res(i, 1) = Scalar(0); \ - Scalar res(i, 2) = Scalar(0); \ - Scalar res(i, 3) = Scalar(0); \ - Scalar res(i, 4) = Scalar(0); \ - Scalar res(i, 5) = Scalar(0); \ - Scalar res(i, 6) = Scalar(0); \ - Scalar res(i, 7) = Scalar(0); \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + internal::scalar_cast_op conv; initResultRow(0); initResultRow(1); initResultRow(2); @@ -1313,6 +1314,34 @@ struct TensorEvaluator struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + template void evalTyped(Scalar* buffer) const { // columns in left side, rows in right side @@ -1353,28 +1382,7 @@ struct TensorEvaluator::value && - internal::is_same::value) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); - } - } else { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); - } + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); } }; From 46fc23f91c1c5ea21bab67976773c613bd7e4ab0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 19 Feb 2016 13:44:22 -0800 Subject: [PATCH 18/18] Print an error message to stderr when the initialization of the CUDA runtime fails. This helps debugging setup issues. --- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 3808eb155..c01704e56 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -34,12 +34,23 @@ static void initializeDeviceProp() { if (!m_devicePropInitialized) { int num_devices; cudaError_t status = cudaGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(status) - assert(status == cudaSuccess); + if (status != cudaSuccess) { + std::cerr << "Failed to get the number of CUDA devices: " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } m_deviceProperties = new cudaDeviceProp[num_devices]; for (int i = 0; i < num_devices; ++i) { status = cudaGetDeviceProperties(&m_deviceProperties[i], i); - assert(status == cudaSuccess); + if (status != cudaSuccess) { + std::cerr << "Failed to initialize CUDA device #" + << i + << ": " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } } m_devicePropInitialized = true; }