From e7e6809e6b38a5928efc0b5ca9520258e4d1fb3a Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Mon, 19 Nov 2018 18:13:59 +0000 Subject: [PATCH] ROCm/HIP specfic fixes + updates 1. Eigen/src/Core/arch/GPU/Half.h Updating the HIPCC implementation half so that it can declared as a __shared__ variable 2. Eigen/src/Core/util/Macros.h, Eigen/src/Core/util/Memory.h introducing a EIGEN_USE_STD(func) macro that calls - std::func be default - ::func when eigen is being compiled with HIPCC This change was requested in the previous HIP PR (https://bitbucket.org/eigen/eigen/pull-requests/518/pr-with-hip-specific-fixes-for-the-eigen/diff) 3. unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h Removing EIGEN_DEVICE_FUNC attribute from pure virtual methods as it is not supported by HIPCC 4. unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h Disabling the template specializations of InnerMostDimReducer as they run into HIPCC link errors --- Eigen/src/Core/arch/GPU/Half.h | 8 ++- Eigen/src/Core/util/Macros.h | 13 +++++ Eigen/src/Core/util/Memory.h | 51 ++++++------------- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 5 files changed, 37 insertions(+), 41 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h index f87d8a18c..7873f8ec0 100644 --- a/Eigen/src/Core/arch/GPU/Half.h +++ b/Eigen/src/Core/arch/GPU/Half.h @@ -52,7 +52,9 @@ namespace half_impl { #if !defined(EIGEN_HAS_GPU_FP16) // Make our own __half_raw definition that is similar to CUDA's. struct __half_raw { - EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + // The default constructor cannot initialize its member, otherwise the + // derived class Eigen::Half cannot be used as __shared__ variable in HIPCC. + EIGEN_DEVICE_FUNC __half_raw() {} explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; @@ -70,7 +72,9 @@ struct __half_raw { // so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make // that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine struct __half_raw { - EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + // The default constructor cannot initialize its member, otherwise the + // derived class Eigen::Half cannot be used as __shared__ variable in HIPCC. + EIGEN_DEVICE_FUNC __half_raw() {} explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} union { unsigned short x; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 9d277e26f..a7c6f50c3 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -896,6 +896,19 @@ namespace Eigen { #endif +// When compiling HIP device code with HIPCC, certain functions +// from the stdlib need to be pulled in from the global namespace +// (as opposed to from the std:: namespace). This is because HIPCC +// does not natively support all the std:: routines in device code. +// Instead it contains header files that declare the corresponding +// routines in the global namespace such they can be used in device code. +#if defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_USING_STD(FUNC) using ::FUNC; +#else + #define EIGEN_USING_STD(FUNC) using std::FUNC; +#endif + + #if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a135761d6..87b538658 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -99,12 +99,9 @@ inline void throw_std_bad_alloc() EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) { eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2"); - -#if defined(EIGEN_HIP_DEVICE_COMPILE) - void *original = ::malloc(size+alignment); -#else - void *original = std::malloc(size+alignment); -#endif + + EIGEN_USING_STD(malloc) + void *original = malloc(size+alignment); if (original == 0) return 0; void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(alignment-1))) + alignment); @@ -116,11 +113,8 @@ EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::si EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr) { if (ptr) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - ::free(*(reinterpret_cast(ptr) - 1)); -#else - std::free(*(reinterpret_cast(ptr) - 1)); -#endif + EIGEN_USING_STD(free) + free(*(reinterpret_cast(ptr) - 1)); } } @@ -183,11 +177,8 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - #if defined(EIGEN_HIP_DEVICE_COMPILE) - result = ::malloc(size); - #else - result = std::malloc(size); - #endif + EIGEN_USING_STD(malloc) + result = malloc(size); #if EIGEN_DEFAULT_ALIGN_BYTES==16 eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator."); @@ -207,11 +198,8 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - #if defined(EIGEN_HIP_DEVICE_COMPILE) - ::free(ptr); - #else - std::free(ptr); - #endif + EIGEN_USING_STD(free) + free(ptr); #else handmade_aligned_free(ptr); @@ -256,11 +244,8 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std: { check_that_malloc_is_allowed(); - #if defined(EIGEN_HIP_DEVICE_COMPILE) - void *result = ::malloc(size); - #else - void *result = std::malloc(size); - #endif + EIGEN_USING_STD(malloc) + void *result = malloc(size); if(!result && size) throw_std_bad_alloc(); @@ -275,11 +260,8 @@ template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - ::free(ptr); - #else - std::free(ptr); - #endif + EIGEN_USING_STD(free) + free(ptr); } template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) @@ -540,11 +522,8 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - #if defined(EIGEN_HIP_DEVICE_COMPILE) - ::memcpy(target, start, size); - #else - std::memcpy(target, start, size); - #endif + EIGEN_USING_STD(memcpy) + memcpy(target, start, size); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 3b87b114d..bb330a77b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -45,8 +45,8 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { class Allocator { public: virtual ~Allocator() {} - EIGEN_DEVICE_FUNC virtual void* allocate(size_t num_bytes) const = 0; - EIGEN_DEVICE_FUNC virtual void deallocate(void* buffer) const = 0; + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; }; // Build a thread pool device on top the an existing pool of threads. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index bda114751..50fa0cb2e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -195,6 +195,7 @@ struct InnerMostDimReducer { } }; +#if !defined(EIGEN_HIPCC) static const int kLeafSize = 1024; template @@ -218,7 +219,6 @@ struct InnerMostDimReducer { } }; -#if !defined(EIGEN_HIPCC) template struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType