Merged in deven-amd/eigen (pull request PR-402)

Adding support for using Eigen in HIP kernels.
Commit da0c604078 (Gael Guennebaud, 2018-07-12 08:07:16 +00:00)
61 changed files with 1895 additions and 999 deletions
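For orientation, here is a minimal sketch of what the patch enables; it is not part of the commit, the kernel and launch code are illustrative assumptions, and error checking is omitted. Fixed-size Eigen types become usable inside a HIP kernel because, under hipcc, EIGEN_DEVICE_FUNC expands to __host__ __device__ (see the Eigen/Core changes below).

// Illustrative sketch only, not part of this commit: using Eigen inside a
// HIP kernel built with hipcc (for example: hipcc -I<path-to-eigen> example.cpp).
#include <hip/hip_runtime.h>
#include <Eigen/Core>

__global__ void scale_vectors(const float* in, float* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Fixed-size Eigen types work on the device because their methods are
    // EIGEN_DEVICE_FUNC (__host__ __device__ under EIGEN_HIPCC).
    Eigen::Map<const Eigen::Vector3f> v(in + 3 * i);
    Eigen::Map<Eigen::Vector3f> r(out + 3 * i);
    r = 2.0f * v;
  }
}

int main() {
  const int n = 256;
  float *d_in = nullptr, *d_out = nullptr;
  hipMalloc(&d_in, 3 * n * sizeof(float));
  hipMalloc(&d_out, 3 * n * sizeof(float));
  hipLaunchKernelGGL(scale_vectors, dim3((n + 63) / 64), dim3(64), 0, 0, d_in, d_out, n);
  hipDeviceSynchronize();
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}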

Eigen/Core

@@ -22,6 +22,17 @@
 #define EIGEN_CUDA_ARCH __CUDA_ARCH__
 #endif
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+// analogous to EIGEN_CUDACC, but for HIP
+#define EIGEN_HIPCC __HIPCC__
+#endif
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+#error "NVCC as the target platform for HIPCC is currently not supported."
+#endif
 // Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
 #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
 #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
@@ -32,8 +43,8 @@
 #endif
 // Handle NVCC/CUDA/SYCL
-#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
-// Do not try asserts on CUDA and SYCL!
+#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC)
+// Do not try asserts on CUDA, HIP and SYCL!
 #ifndef EIGEN_NO_DEBUG
 #define EIGEN_NO_DEBUG
 #endif
@@ -71,6 +82,26 @@
 #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
 #endif
 #endif
+#elif defined(EIGEN_HIPCC)
+// Do not try to vectorize on HIP
+#ifndef EIGEN_DONT_VECTORIZE
+#define EIGEN_DONT_VECTORIZE
+#endif
+#define EIGEN_DEVICE_FUNC __host__ __device__
+// We need hip_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
+// works properly on the device side
+#include <hip/hip_runtime.h>
+#if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP)
+// analogous to EIGEN_CUDA_ARCH, but for HIP
+#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+// Note this check needs to come after we include hip_runtime.h since
+// hip_runtime.h includes hip_common.h which in turn has the define
+// for __HIP_DEVICE_COMPILE__
+#endif
 #else
 #define EIGEN_DEVICE_FUNC
 #endif
@@ -81,16 +112,71 @@
 #endif
 #endif
-// When compiling CUDA device code with NVCC, pull in math functions from the
-// global namespace. In host mode, and when device doee with clang, use the
-// std versions.
-#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
+//
+#define EIGEN_GPUCC
+//
+// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
+// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
+//
+// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
+// For those cases, the corresponding code should be guarded with
+// #if defined(EIGEN_GPUCC)
+// instead of
+// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+// #if defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+// #if defined(EIGEN_CUDACC)
+//
+#endif
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
+//
+#define EIGEN_GPU_COMPILE_PHASE
+//
+// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
+// + one to compile the source for the "host" (ie CPU)
+// + another to compile the source for the "device" (ie. GPU)
+//
+// Code that needs to enabled only during the either the "host" or "device" compilation phase
+// needs to be guarded with a macro that indicates the current compilation phase
+//
+// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
+// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
+//
+// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
+// For those cases, the code should be guarded with
+// #if defined(EIGEN_GPU_COMPILE_PHASE)
+// instead of
+// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+// #if defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+// #if defined(EIGEN_CUDA_ARCH)
+//
+#endif
+// When compiling CUDA device code with NVCC, or HIP device code with HIPCC
+// pull in math functions from the global namespace. In host mode, and when
+// device doee with clang, use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIPCC__))
 #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
 #else
 #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
 #endif
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
 #define EIGEN_EXCEPTIONS
 #endif
@@ -271,7 +357,7 @@
 #endif
 #if defined EIGEN_CUDACC
-#define EIGEN_VECTORIZE_CUDA
+#define EIGEN_VECTORIZE_GPU
 #include <vector_types.h>
 #if EIGEN_CUDACC_VER >= 70500
 #define EIGEN_HAS_CUDA_FP16
@@ -283,6 +369,27 @@
 #include <cuda_fp16.h>
 #endif
+#if defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)
+#define EIGEN_VECTORIZE_GPU
+#include <hip/hip_vector_types.h>
+#define EIGEN_HAS_HIP_FP16
+#include <hip/hip_fp16.h>
+#define HIP_PATCH_WITH_NEW_FP16 18215
+#if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16)
+#define EIGEN_HAS_OLD_HIP_FP16
+// Old HIP implementation does not have a explicit typedef for "half2"
+typedef __half2 half2;
+#endif
+#endif
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+#define EIGEN_HAS_GPU_FP16
+#endif
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
 #define EIGEN_HAS_OPENMP
 #endif
@@ -403,7 +510,6 @@ using std::ptrdiff_t;
 #include "src/Core/util/IntegralConstant.h"
 #include "src/Core/util/SymbolicIndex.h"
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
@@ -447,13 +553,13 @@ using std::ptrdiff_t;
 #endif
 // Half float support
-#include "src/Core/arch/CUDA/Half.h"
-#include "src/Core/arch/CUDA/PacketMathHalf.h"
-#include "src/Core/arch/CUDA/TypeCasting.h"
-#if defined EIGEN_VECTORIZE_CUDA
-#include "src/Core/arch/CUDA/PacketMath.h"
-#include "src/Core/arch/CUDA/MathFunctions.h"
+#include "src/Core/arch/GPU/Half.h"
+#include "src/Core/arch/GPU/PacketMathHalf.h"
+#include "src/Core/arch/GPU/TypeCasting.h"
+#if defined EIGEN_VECTORIZE_GPU
+#include "src/Core/arch/GPU/PacketMath.h"
+#include "src/Core/arch/GPU/MathFunctions.h"
 #endif
 #include "src/Core/arch/Default/Settings.h"

Eigen/src/Core/GeneralProduct.h

@@ -35,7 +35,7 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
 template<int Size, int MaxSize> struct product_size_category
 {
 enum {
-#ifndef EIGEN_CUDA_ARCH
+#ifndef EIGEN_GPU_COMPILE_PHASE
 is_large = MaxSize == Dynamic ||
 Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
 (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),

Eigen/src/Core/GenericPacketMath.h

@@ -303,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// do nothing
+#elif defined(EIGEN_CUDA_ARCH)
 #if defined(__LP64__)
 // 64-bit pointer operand constraint for inlined asm
 asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@@ -530,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
 ***************************************************************************/
 // Eigen+CUDA does not support complexes.
-#ifndef EIGEN_CUDACC
+#if !defined(EIGEN_GPUCC)
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

Eigen/src/Core/MathFunctions.h

@@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct real_impl<std::complex<T> >
 {
@@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct imag_impl<std::complex<T> >
 {
@@ -260,7 +260,7 @@ struct conj_default_impl<Scalar,true>
 template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct conj_impl<std::complex<T> >
 {
@@ -435,7 +435,12 @@ struct round_retval
 struct arg_impl {
 static inline Scalar run(const Scalar& x)
 {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIP does not seem to have a native device side implementation for the math routine "arg"
+using std::arg;
+#else
 EIGEN_USING_STD_MATH(arg);
+#endif
 return arg(x);
 }
 };
@@ -768,7 +773,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isfinite_impl(const T& x)
 {
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 return (::isfinite)(x);
 #elif EIGEN_USE_STD_FPCLASSIFY
 using std::isfinite;
@@ -783,7 +788,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isinf_impl(const T& x)
 {
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 return (::isinf)(x);
 #elif EIGEN_USE_STD_FPCLASSIFY
 using std::isinf;
@@ -798,7 +803,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isnan_impl(const T& x)
 {
-#ifdef EIGEN_CUDA_ARCH
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 return (::isnan)(x);
 #elif EIGEN_USE_STD_FPCLASSIFY
 using std::isnan;
@@ -864,7 +869,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
 namespace numext {
-#if (!defined(EIGEN_CUDACC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -977,7 +982,12 @@ template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
 {
+#if defined(EIGEN_HIPCC)
+// no "fminl" on HIP yet
+return (x < y) ? x : y;
+#else
 return fminl(x, y);
+#endif
 }
 template<typename T>
@@ -1002,7 +1012,12 @@ template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
 {
+#if defined(EIGEN_HIPCC)
+// no "fmaxl" on HIP yet
+return (x > y) ? x : y;
+#else
 return fmaxl(x, y);
+#endif
 }
 #endif
@@ -1099,7 +1114,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
 EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log1p(const float &x) { return ::log1pf(x); }
@@ -1157,7 +1172,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
 EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float floor(const float &x) { return ::floorf(x); }
@@ -1178,7 +1193,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
 EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float ceil(const float &x) { return ::ceilf(x); }
@@ -1236,7 +1251,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log(const float &x) { return ::logf(x); }
@@ -1264,7 +1279,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
 EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const float &x) { return ::fabsf(x); }
@@ -1294,7 +1309,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
 EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float exp(const float &x) { return ::expf(x); }
@@ -1330,7 +1345,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
 EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float expm1(const float &x) { return ::expm1f(x); }
@@ -1350,7 +1365,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
 EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cos(const float &x) { return ::cosf(x); }
@@ -1370,7 +1385,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
 EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sin(const float &x) { return ::sinf(x); }
@@ -1390,7 +1405,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
 EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tan(const float &x) { return ::tanf(x); }
@@ -1421,7 +1436,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
 EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float acos(const float &x) { return ::acosf(x); }
@@ -1452,7 +1467,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
 EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float asin(const float &x) { return ::asinf(x); }
@@ -1483,7 +1498,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
 EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float atan(const float &x) { return ::atanf(x); }
@@ -1504,7 +1519,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
 EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cosh(const float &x) { return ::coshf(x); }
@@ -1524,7 +1539,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
 EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sinh(const float &x) { return ::sinhf(x); }
@@ -1542,12 +1557,12 @@ T tanh(const T &x) {
 #if defined(__SYCL_DEVICE_ONLY__)
 EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
 EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
-#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
+#elif (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(float x) { return internal::generic_fast_tanh_float(x); }
 #endif
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }
@@ -1567,7 +1582,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
 EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef EIGEN_CUDACC
+#if defined(EIGEN_GPUCC)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float fmod(const float& a, const float& b) {
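The MathFunctions.h changes above are what allow a single numext:: spelling to resolve to the global device overloads (::logf, ::floorf, ...) during the CUDA/HIP device pass and to the std:: versions on the host. A short sketch of user-side code that relies on that dispatch (illustrative, not from the patch):

#include <Eigen/Core>

// A functor usable in both CPU expressions and CUDA/HIP kernels: numext::log
// and numext::floor pick the right overload for the current compile phase.
struct LogFloorOp {
  EIGEN_DEVICE_FUNC float operator()(const float& x) const {
    return Eigen::numext::floor(Eigen::numext::log(x));
  }
};

// Host-side usage; the same functor could also be called from device code.
// Eigen::ArrayXf a = Eigen::ArrayXf::LinSpaced(4, 1.0f, 4.0f);
// Eigen::ArrayXf b = a.unaryExpr(LogFloorOp());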

Eigen/src/Core/ProductEvaluators.h

@@ -885,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
 return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
 }
-#ifndef EIGEN_CUDACC
+#ifndef EIGEN_GPUCC
 template<int LoadMode,typename PacketType>
 EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
 {
@@ -929,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
 return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
 }
-#ifndef EIGEN_CUDACC
+#ifndef EIGEN_GPUCC
 template<int LoadMode,typename PacketType>
 EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
 {

Eigen/src/Core/arch/GPU/Half.h

@@ -26,15 +26,15 @@
 // Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// type Eigen::half (inheriting from CUDA's __half struct) with
+// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
 // operator overloads such that it behaves basically as an arithmetic
 // type. It will be quite slow on CPUs (so it is recommended to stay
 // in fp32 for CPUs, except for simple parameter conversions, I/O
 // to disk and the likes), but fast on GPUs.
-#ifndef EIGEN_HALF_CUDA_H
-#define EIGEN_HALF_CUDA_H
+#ifndef EIGEN_HALF_GPU_H
+#define EIGEN_HALF_GPU_H
 #if __cplusplus > 199711L
 #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@@ -49,16 +49,41 @@ struct half;
 namespace half_impl {
-#if !defined(EIGEN_HAS_CUDA_FP16)
+#if !defined(EIGEN_HAS_GPU_FP16)
 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
 EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
 explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
 unsigned short x;
 };
-#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+#elif defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+// Make a __half_raw definition that is
+// ++ compatible with that of Eigen and
+// ++ add a implcit conversion to the native __half of the old HIP implementation.
+//
+// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
+//
+// In the old HIP implementation,
+// ++ __half is a typedef of __fp16
+// ++ the "__h*" routines take "__half" arguments
+// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
+// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine
+struct __half_raw {
+EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+union {
+unsigned short x;
+__half data;
+};
+operator __half(void) const { return data; }
+};
+#endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
 typedef __half __half_raw;
+#endif
 #endif
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
@@ -69,8 +94,19 @@ struct half_base : public __half_raw {
 EIGEN_DEVICE_FUNC half_base() {}
 EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
 EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
+#else
+EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
+#endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
 EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+#endif
+#endif
 #endif
 };
@@ -78,17 +114,38 @@ struct half_base : public __half_raw {
 // Class definition.
 struct half : public half_impl::half_base {
-#if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
-typedef half_impl::__half_raw __half_raw;
-#endif
+// Writing this out as separate #if-else blocks to make the code easier to follow
+// The same applies to most #if-else blocks in this file
+#if !defined(EIGEN_HAS_GPU_FP16)
+typedef half_impl::__half_raw __half_raw;
+#elif defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+typedef half_impl::__half_raw __half_raw;
+#endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+// Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
+// So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+typedef half_impl::__half_raw __half_raw;
+#endif
+#endif
 EIGEN_DEVICE_FUNC half() {}
 EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
 EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_HAS_HIP_FP16)
 EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
+#endif
 #endif
 explicit EIGEN_DEVICE_FUNC half(bool b)
 : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -201,7 +258,8 @@ namespace Eigen {
 namespace half_impl {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
 // Intrinsics for native fp16 support. Note that on current hardware,
 // these are no faster than fp32 arithmetic (you need to use the half2
@@ -262,7 +320,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
 #else // Emulate support for half floats
-// Definitions for CPUs and older CUDA, mostly working through conversion
+// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
 // to/from fp32.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
@@ -342,7 +400,8 @@ union FP32 {
 };
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 __half tmp_ff = __float2half(ff);
 return *(__half_raw*)&tmp_ff;
@@ -398,7 +457,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 return __half2float(h);
 #elif defined(EIGEN_HAS_FP16_C)
@@ -432,7 +492,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
 return (a.x & 0x7fff) == 0x7c00;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 return __hisnan(a);
 #else
 return (a.x & 0x7fff) > 0x7c00;
@@ -448,7 +509,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
 return result;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+defined(EIGEN_HIP_DEVICE_COMPILE)
 return half(hexp(a));
 #else
 return half(::expf(float(a)));
@@ -458,7 +520,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
 return half(numext::expm1(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 return half(::hlog(a));
 #else
 return half(::logf(float(a)));
@@ -471,7 +534,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
 return half(::log10f(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+defined(EIGEN_HIP_DEVICE_COMPILE)
 return half(hsqrt(a));
 #else
 return half(::sqrtf(float(a)));
@@ -493,14 +557,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
 return half(::tanhf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+defined(EIGEN_HIP_DEVICE_COMPILE)
 return half(hfloor(a));
 #else
 return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+defined(EIGEN_HIP_DEVICE_COMPILE)
 return half(hceil(a));
 #else
 return half(::ceilf(float(a)));
@@ -508,7 +574,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 return __hlt(b, a) ? b : a;
 #else
 const float f1 = static_cast<float>(a);
@@ -517,7 +584,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 return __hlt(a, b) ? b : a;
 #else
 const float f1 = static_cast<float>(a);
@@ -595,7 +663,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
 return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+defined(EIGEN_HIP_DEVICE_COMPILE)
 return Eigen::half(::hlog(a));
 #else
 return Eigen::half(::logf(float(a)));
@@ -629,9 +698,12 @@ struct hash<Eigen::half> {
 // Add the missing shfl_xor intrinsic
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+defined(EIGEN_HIP_DEVICE_COMPILE)
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
-#if EIGEN_CUDACC_VER < 90000
+#if (EIGEN_CUDACC_VER < 90000) || \
+defined(EIGEN_HAS_HIP_FP16)
 return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
 #else
 return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
@@ -640,7 +712,8 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 #endif
 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
+defined(EIGEN_HIP_DEVICE_COMPILE)
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
 return Eigen::half_impl::raw_uint16_to_half(
 __ldg(reinterpret_cast<const unsigned short*>(ptr)));
@@ -648,7 +721,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
 #endif
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 namespace Eigen {
 namespace numext {
@@ -674,4 +747,4 @@ bool (isfinite)(const Eigen::half& h) {
 } // namespace numext
 #endif
-#endif // EIGEN_HALF_CUDA_H
+#endif // EIGEN_HALF_GPU_H
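As the header comment above notes, Eigen::half is meant to behave basically as an arithmetic type: on CPUs the operators are emulated through fp32 conversions (slow), while on CUDA or HIP devices with fp16 support they map to the native __h* intrinsics. A host-side sketch (not from the patch):

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::half a(1.5f);
  Eigen::half b(0.25f);
  Eigen::half c = a * b + Eigen::half(2.0f);   // emulated via fp32 on the host
  std::cout << static_cast<float>(c) << "\n";  // prints 2.375
  return 0;
}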

Eigen/src/Core/arch/GPU/MathFunctions.h

@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
-#define EIGEN_MATH_FUNCTIONS_CUDA_H
+#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
+#define EIGEN_MATH_FUNCTIONS_GPU_H
 namespace Eigen {
@@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plog<float4>(const float4& a)
 {
@@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a)
 } // end namespace Eigen
-#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
+#endif // EIGEN_MATH_FUNCTIONS_GPU_H

Eigen/src/Core/arch/GPU/PacketMath.h

@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_PACKET_MATH_CUDA_H
-#define EIGEN_PACKET_MATH_CUDA_H
+#ifndef EIGEN_PACKET_MATH_GPU_H
+#define EIGEN_PACKET_MATH_GPU_H
 namespace Eigen {
@@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 template<> struct is_arithmetic<float4> { enum { value = true }; };
 template<> struct is_arithmetic<double2> { enum { value = true }; };
@@ -338,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) {
 } // end namespace Eigen
-#endif // EIGEN_PACKET_MATH_CUDA_H
+#endif // EIGEN_PACKET_MATH_GPU_H

Eigen/src/Core/arch/GPU/PacketMathHalf.h

@@ -7,15 +7,16 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
-#define EIGEN_PACKET_MATH_HALF_CUDA_H
+#ifndef EIGEN_PACKET_MATH_HALF_GPU_H
+#define EIGEN_PACKET_MATH_HALF_GPU_H
 namespace Eigen {
 namespace internal {
 // Most of the following operations require arch >= 3.0
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
 template<> struct is_arithmetic<half2> { enum { value = true }; };
@@ -43,7 +44,18 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
 template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+return half2half2(from);
+#else
 return __half2half2(from);
+#endif
+#else // EIGEN_CUDA_ARCH
+return __half2half2(from);
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
@@ -69,20 +81,46 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half*
 template<>
 __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+return __halves2half2((*(from+0)), (*(from+1)));
+#else
+return __ldg((const half2*)from);
+#endif
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 350
 return __ldg((const half2*)from);
 #else
 return __halves2half2(*(from+0), *(from+1));
 #endif
+#endif
 }
 template<>
 __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+return __halves2half2((*(from+0)), (*(from+1)));
+#else
+return __halves2half2(__ldg(from+0), __ldg(from+1));
+#endif
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 350
 return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
 return __halves2half2(*(from+0), *(from+1));
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
@@ -117,15 +155,29 @@ ptranspose(PacketBlock<half2,2>& kernel) {
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
 float f = __half2float(a) + 1.0f;
 return __halves2half2(a, __float2half(f));
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+return __hadd2(a, b);
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 return __hadd2(a, b);
 #else
@@ -137,9 +189,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
 float r2 = a2 + b2;
 return __floats2half2_rn(r1, r2);
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+return __hsub2(a, b);
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 return __hsub2(a, b);
 #else
@@ -151,9 +211,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
 float r2 = a2 - b2;
 return __floats2half2_rn(r1, r2);
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+return __hneg2(a);
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 return __hneg2(a);
 #else
@@ -161,11 +229,19 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
 float a2 = __high2float(a);
 return __floats2half2_rn(-a1, -a2);
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+return __hmul2(a, b);
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 return __hmul2(a, b);
 #else
@@ -177,9 +253,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
 float r2 = a2 * b2;
 return __floats2half2_rn(r1, r2);
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+return __hfma2(a, b, c);
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 return __hfma2(a, b, c);
 #else
@@ -193,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
 float r2 = a2 * b2 + c2;
 return __floats2half2_rn(r1, r2);
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_HAS_OLD_HIP_FP16)
+return h2div(a, b);
+#else
+return __h2div(a, b);
+#endif
+#else // EIGEN_CUDA_ARCH
 float a1 = __low2float(a);
 float a2 = __high2float(a);
 float b1 = __low2float(b);
@@ -203,6 +299,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
 float r1 = a1 / b1;
 float r2 = a2 / b2;
 return __floats2half2_rn(r1, r2);
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
@@ -226,6 +324,12 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
 }
 template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+return __hadd(__low2half(a), __high2half(a));
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 return __hadd(__low2half(a), __high2half(a));
 #else
@@ -233,9 +337,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
 float a2 = __high2float(a);
 return Eigen::half(__float2half(a1 + a2));
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+__half first = __low2half(a);
+__half second = __high2half(a);
+return __hgt(first, second) ? first : second;
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 __half first = __low2half(a);
 __half second = __high2half(a);
@@ -245,9 +359,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
 float a2 = __high2float(a);
 return a1 > a2 ? __low2half(a) : __high2half(a);
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+__half first = __low2half(a);
+__half second = __high2half(a);
+return __hlt(first, second) ? first : second;
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 __half first = __low2half(a);
 __half second = __high2half(a);
@@ -257,9 +381,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
 float a2 = __high2float(a);
 return a1 < a2 ? __low2half(a) : __high2half(a);
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+return __hmul(__low2half(a), __high2half(a));
+#else // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
 return __hmul(__low2half(a), __high2half(a));
 #else
@@ -267,6 +399,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
 float a2 = __high2float(a);
 return Eigen::half(__float2half(a1 * a2));
 #endif
+#endif
 }
 template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
@@ -285,7 +419,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
 return __floats2half2_rn(r1, r2);
 }
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+defined(EIGEN_HIP_DEVICE_COMPILE)
 template<> __device__ EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
@@ -1281,4 +1416,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
 }
 }
-#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
+#endif // EIGEN_PACKET_MATH_HALF_GPU_H

@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_CUDA_H #ifndef EIGEN_TYPE_CASTING_GPU_H
#define EIGEN_TYPE_CASTING_CUDA_H #define EIGEN_TYPE_CASTING_GPU_H
namespace Eigen { namespace Eigen {
@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type; typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(a); return __float2half(a);
#else #else
return Eigen::half(a); return Eigen::half(a);
@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type; typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(static_cast<float>(a)); return __float2half(static_cast<float>(a));
#else #else
return Eigen::half(static_cast<float>(a)); return Eigen::half(static_cast<float>(a));
@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef float result_type; typedef float result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(a); return __half2float(a);
#else #else
return static_cast<float>(a); return static_cast<float>(a);
@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
template <> template <>
struct type_casting_traits<Eigen::half, float> { struct type_casting_traits<Eigen::half, float> {
@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_CUDA_H #endif // EIGEN_TYPE_CASTING_GPU_H
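For reference, a hedged sketch of the combined fp16 guard that the scalar_cast_op specializations above now share. The helper name float_to_half_sketch is hypothetical; the intrinsics and Eigen::half come from headers this file already pulls in.

// Hypothetical helper showing the combined fp16 guard used by the scalar_cast_op
// specializations above: hardware conversion on a capable CUDA or HIP device pass,
// Eigen::half's software conversion everywhere else.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half float_to_half_sketch(const float& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __float2half(a);
#else
  return Eigen::half(a);
#endif
}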
@ -0,0 +1,23 @@
/*
* math_constants.h -
* HIP equivalent of the CUDA header of the same name
*/
#ifndef __MATH_CONSTANTS_H__
#define __MATH_CONSTANTS_H__
/* single precision constants */
#define HIPRT_INF_F __int_as_float(0x7f800000)
#define HIPRT_NAN_F __int_as_float(0x7fffffff)
#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
#define HIPRT_ZERO_F 0.0f
#define HIPRT_ONE_F 1.0f
/* double precision constants */
#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
#endif
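A short usage sketch of the new HIP math_constants.h header: it mirrors the CUDART_* names so device code can pick the right constant per compile pass. The function name is hypothetical, and the host fallback assumes <limits> is available.

// Hypothetical helper: HIPRT_* mirrors CUDART_*, so the per-pass selection is a plain #if.
EIGEN_DEVICE_FUNC inline float device_infinity_sketch() {
#if defined(EIGEN_CUDA_ARCH)
  return CUDART_INF_F;                            // from CUDA's <math_constants.h>
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
  return HIPRT_INF_F;                             // from the header above
#else
  return std::numeric_limits<float>::infinity();  // host fallback
#endif
}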
@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
{ {
#ifdef EIGEN_CUDACC #ifdef EIGEN_GPUCC
// FIXME is there some kind of cuda::swap? // FIXME is there some kind of cuda::swap?
Scalar t=b; const_cast<Scalar&>(b)=a; a=t; Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
#else #else
@ -436,7 +436,7 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type; typedef typename BinaryOp::result_type result_type;
bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
@ -455,7 +455,7 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type; typedef typename BinaryOp::result_type result_type;
bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
@ -48,7 +48,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run( EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
Index rows, Index cols, Index rows, Index cols,
const LhsMapper& lhs, const LhsMapper& lhs,
const RhsMapper& rhs, const RhsMapper& rhs,
@ -57,7 +57,7 @@ EIGEN_DONT_INLINE static void run(
}; };
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols, Index rows, Index cols,
const LhsMapper& alhs, const LhsMapper& alhs,
const RhsMapper& rhs, const RhsMapper& rhs,
@ -231,7 +231,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run( EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
Index rows, Index cols, Index rows, Index cols,
const LhsMapper& lhs, const LhsMapper& lhs,
const RhsMapper& rhs, const RhsMapper& rhs,
@ -240,7 +240,7 @@ EIGEN_DONT_INLINE static void run(
}; };
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols, Index rows, Index cols,
const LhsMapper& alhs, const LhsMapper& alhs,
const RhsMapper& rhs, const RhsMapper& rhs,
@ -1008,9 +1008,12 @@ namespace Eigen {
# define EIGEN_TRY try # define EIGEN_TRY try
# define EIGEN_CATCH(X) catch (X) # define EIGEN_CATCH(X) catch (X)
#else #else
# ifdef EIGEN_CUDA_ARCH # if defined(EIGEN_CUDA_ARCH)
# define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW_X(X) asm("trap;")
# define EIGEN_THROW asm("trap;") # define EIGEN_THROW asm("trap;")
# elif defined(EIGEN_HIP_DEVICE_COMPILE)
# define EIGEN_THROW_X(X) asm("s_trap 0")
# define EIGEN_THROW asm("s_trap 0")
# else # else
# define EIGEN_THROW_X(X) std::abort() # define EIGEN_THROW_X(X) std::abort()
# define EIGEN_THROW std::abort() # define EIGEN_THROW std::abort()
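A sketch of what the EIGEN_THROW mapping above amounts to on each target. The function is purely illustrative (not in the patch) and assumes <cstdlib> for the host branch.

// Illustrative only: what a fatal error reduces to on each target after this change.
EIGEN_DEVICE_FUNC inline void fatal_error_sketch() {
#if defined(EIGEN_CUDA_ARCH)
  asm("trap;");        // NVCC device pass: PTX trap
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
  asm("s_trap 0");     // HIP device pass: GCN trap
#else
  std::abort();        // host code with exceptions disabled
#endif
}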
@ -70,7 +70,20 @@ inline void throw_std_bad_alloc()
throw std::bad_alloc(); throw std::bad_alloc();
#else #else
std::size_t huge = static_cast<std::size_t>(-1); std::size_t huge = static_cast<std::size_t>(-1);
#if defined(EIGEN_HIPCC)
//
// calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
// and as a consequence the code in the #else block triggers the hipcc warning :
// "no overloaded function has restriction specifiers that are compatible with the ambient context"
//
// "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
// the same on "operator new"
// Reverting code back to the old version in this #if block for the hipcc compiler
//
new int[huge];
#else
::operator new(huge); ::operator new(huge);
#endif
#endif #endif
} }
@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
void *result; void *result;
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
#if defined(EIGEN_HIP_DEVICE_COMPILE)
result = ::malloc(size);
#else
result = std::malloc(size); result = std::malloc(size);
#endif
#if EIGEN_DEFAULT_ALIGN_BYTES==16 #if EIGEN_DEFAULT_ALIGN_BYTES==16
eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator.");
#endif #endif
@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
{ {
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
#if defined(EIGEN_HIP_DEVICE_COMPILE)
::free(ptr);
#else
std::free(ptr); std::free(ptr);
#endif
#else #else
handmade_aligned_free(ptr); handmade_aligned_free(ptr);
#endif #endif
@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
{ {
check_that_malloc_is_allowed(); check_that_malloc_is_allowed();
#if defined(EIGEN_HIP_DEVICE_COMPILE)
void *result = ::malloc(size);
#else
void *result = std::malloc(size); void *result = std::malloc(size);
#endif
if(!result && size) if(!result && size)
throw_std_bad_alloc(); throw_std_bad_alloc();
return result; return result;
@ -232,7 +262,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr) template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
{ {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
::free(ptr);
#else
std::free(ptr); std::free(ptr);
#endif
} }
template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@ -493,7 +527,11 @@ template<typename T> struct smart_copy_helper<T,true> {
IntPtr size = IntPtr(end)-IntPtr(start); IntPtr size = IntPtr(end)-IntPtr(start);
if(size==0) return; if(size==0) return;
eigen_internal_assert(start!=0 && end!=0 && target!=0); eigen_internal_assert(start!=0 && end!=0 && target!=0);
#if defined(EIGEN_HIP_DEVICE_COMPILE)
::memcpy(target, start, size);
#else
std::memcpy(target, start, size); std::memcpy(target, start, size);
#endif
} }
}; };
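The Memory.h hunks above all apply the same substitution. The two wrappers below are a hypothetical condensation of that pattern, assuming <cstdlib> is included (as it is in this file); the patch itself writes the #if out at each call site instead.

// Hypothetical wrappers condensing the pattern used in the hunks above: the global :: forms
// during the HIP device pass, the std:: forms everywhere else.
EIGEN_DEVICE_FUNC inline void* raw_malloc_sketch(std::size_t n) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
  return ::malloc(n);
#else
  return std::malloc(n);
#endif
}
EIGEN_DEVICE_FUNC inline void raw_free_sketch(void* p) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
  ::free(p);
#else
  std::free(p);
#endif
}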
@ -11,9 +11,18 @@
#ifndef EIGEN_META_H #ifndef EIGEN_META_H
#define EIGEN_META_H #define EIGEN_META_H
#if defined(EIGEN_CUDA_ARCH) #if defined(EIGEN_GPU_COMPILE_PHASE)
#include <cfloat>
#include <math_constants.h> #include <cfloat>
#if defined(EIGEN_CUDA_ARCH)
#include <math_constants.h>
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
#endif
#endif #endif
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
@ -177,7 +186,7 @@ template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T> template<typename T> struct enable_if<true,T>
{ typedef T type; }; { typedef T type; };
#if defined(EIGEN_CUDA_ARCH) #if defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(__FLT_EPSILON__) #if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON #define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON #define __DBL_EPSILON__ DBL_EPSILON
@ -199,13 +208,31 @@ template<> struct numeric_limits<float>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static float epsilon() { return __FLT_EPSILON__; } static float epsilon() { return __FLT_EPSILON__; }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static float (max)() { return CUDART_MAX_NORMAL_F; } static float (max)() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_MAX_NORMAL_F;
#else
return HIPRT_MAX_NORMAL_F;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static float (min)() { return FLT_MIN; } static float (min)() { return FLT_MIN; }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static float infinity() { return CUDART_INF_F; } static float infinity() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_INF_F;
#else
return HIPRT_INF_F;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static float quiet_NaN() { return CUDART_NAN_F; } static float quiet_NaN() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_NAN_F;
#else
return HIPRT_NAN_F;
#endif
}
}; };
template<> struct numeric_limits<double> template<> struct numeric_limits<double>
{ {
@ -216,9 +243,21 @@ template<> struct numeric_limits<double>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static double (min)() { return DBL_MIN; } static double (min)() { return DBL_MIN; }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static double infinity() { return CUDART_INF; } static double infinity() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_INF;
#else
return HIPRT_INF;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static double quiet_NaN() { return CUDART_NAN; } static double quiet_NaN() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_NAN;
#else
return HIPRT_NAN;
#endif
}
}; };
template<> struct numeric_limits<int> template<> struct numeric_limits<int>
{ {
@ -531,13 +570,13 @@ template<typename T, typename U> struct scalar_product_traits
namespace numext { namespace numext {
#if defined(EIGEN_CUDA_ARCH) #if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else #else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif #endif
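A small usage sketch for the swap just defined: it is callable from both host and device code, which is the point of the EIGEN_GPU_COMPILE_PHASE split above. The function below is illustrative only and is not meant to be pasted into the header.

// Illustrative only: numext::swap can now be used from code that is compiled for both
// host and device, e.g.
EIGEN_DEVICE_FUNC inline void sort2_sketch(float& a, float& b) {
  if (b < a) Eigen::numext::swap(a, b);
}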
#if defined(EIGEN_CUDA_ARCH) #if defined(EIGEN_GPU_COMPILE_PHASE)
using internal::device::numeric_limits; using internal::device::numeric_limits;
#else #else
using std::numeric_limits; using std::numeric_limits;
@ -557,7 +596,7 @@ T div_ceil(const T &a, const T &b)
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool equal_strict(const X& x,const Y& y) { return x == y; } bool equal_strict(const X& x,const Y& y) { return x == y; }
#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) #if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); } bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
@ -568,7 +607,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to<double
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const X& x,const Y& y) { return x != y; } bool not_equal_strict(const X& x,const Y& y) { return x != y; }
#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) #if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); } bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
@ -1299,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
#endif #endif
}//end deflation }//end deflation
#ifndef EIGEN_CUDACC #if !defined(EIGEN_GPUCC)
/** \svd_module /** \svd_module
* *
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
@ -19,7 +19,10 @@ macro(ei_add_test_internal testname testname_with_suffix)
endif() endif()
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
if(EIGEN_TEST_CUDA_CLANG) if(EIGEN_TEST_HIP)
hip_reset_flags()
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}")
elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
if(CUDA_64_BIT_DEVICE_CODE) if(CUDA_64_BIT_DEVICE_CODE)
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
@ -491,6 +494,11 @@ macro(ei_testing_print_summary)
else() else()
message(STATUS "CUDA: OFF") message(STATUS "CUDA: OFF")
endif() endif()
if(EIGEN_TEST_HIP)
message(STATUS "HIP: ON (using hipcc)")
else()
message(STATUS "HIP: OFF")
endif()
endif() # vectorization / alignment options endif() # vectorization / alignment options
@ -399,7 +399,7 @@ if(CUDA_FOUND)
cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR}) cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR})
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(cuda_basic) ei_add_test(gpu_basic)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
@ -408,6 +408,48 @@ endif(CUDA_FOUND)
endif(EIGEN_TEST_CUDA) endif(EIGEN_TEST_CUDA)
# HIP unit tests
option(EIGEN_TEST_HIP "Add HIP support." OFF)
if (EIGEN_TEST_HIP)
set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
if (EXISTS ${HIP_PATH})
list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
find_package(HIP REQUIRED)
if (HIP_FOUND)
execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
if (${HIP_PLATFORM} STREQUAL "hcc")
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${HIP_PATH}/include)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(gpu_basic)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
elseif (${HIP_PLATFORM} STREQUAL "nvcc")
message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
else ()
message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
endif()
endif(HIP_FOUND)
else ()
message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
endif()
endif(EIGEN_TEST_HIP)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON) add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
@ -15,13 +15,11 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cuda_basic #define EIGEN_TEST_FUNC gpu_basic
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#include <math_constants.h>
#include <cuda.h>
#include "main.h" #include "main.h"
#include "cuda_common.h" #include "gpu_common.h"
// Check that dense modules can be properly parsed by nvcc // Check that dense modules can be properly parsed by nvcc
#include <Eigen/Dense> #include <Eigen/Dense>
@ -164,40 +162,51 @@ struct matrix_inverse {
} }
}; };
void test_cuda_basic() void test_gpu_basic()
{ {
ei_test_init_cuda(); ei_test_init_gpu();
int nthreads = 100; int nthreads = 100;
Eigen::VectorXf in, out; Eigen::VectorXf in, out;
#ifndef __CUDA_ARCH__ #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
int data_size = nthreads * 512; int data_size = nthreads * 512;
in.setRandom(data_size); in.setRandom(data_size);
out.setRandom(data_size); out.setRandom(data_size);
#endif #endif
CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Vector3f>(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Array44f>(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array33f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(redux<Array4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(redux<Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix2f>(), nthreads, in, out) ); #if !defined(EIGEN_USE_HIP)
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix3f>(), nthreads, in, out) ); // FIXME
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix4f>(), nthreads, in, out) ); // These subtests result in a compile failure on the HIP platform
//
// eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
// base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
// (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );
#endif
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct<Matrix3f>(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct<Matrix2f>(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );
#if !defined(EIGEN_USE_HIP)
// FIXME
// These subtests result in a linking error on the HIP platform
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
#endif
} }
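For orientation, a sketch of the shape shared by the functors exercised above, modeled on the coeff_wise style; the struct and its body are illustrative, not the actual test code.

// Illustrative sketch, not the actual test code: the functors passed to run_and_compare_to_gpu
// above all have this shape -- take an index plus flat input/output buffers, run an Eigen
// expression, and write the result, on both host and device.
template <typename Vec>
struct scaled_sum_sketch {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename Vec::Scalar* in, typename Vec::Scalar* out) const {
    Vec a = Vec::Map(in + i * Vec::SizeAtCompileTime);
    Vec b = Vec::Map(in + (i + 1) * Vec::SizeAtCompileTime);
    Vec::Map(out + i * Vec::SizeAtCompileTime) = typename Vec::Scalar(0.5) * a + b;
  }
};
// It would be exercised like the subtests above:
//   CALL_SUBTEST( run_and_compare_to_gpu(scaled_sum_sketch<Vector3f>(), nthreads, in, out) );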
@ -1,13 +1,22 @@
#ifndef EIGEN_TEST_CUDA_COMMON_H #ifndef EIGEN_TEST_GPU_COMMON_H
#define EIGEN_TEST_CUDA_COMMON_H #define EIGEN_TEST_GPU_COMMON_H
#ifdef EIGEN_USE_HIP
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#else
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <iostream> #include <iostream>
#ifndef __CUDACC__ #define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
#if !defined(__CUDACC__) && !defined(__HIPCC__)
dim3 threadIdx, blockDim, blockIdx; dim3 threadIdx, blockDim, blockIdx;
#endif #endif
@ -21,7 +30,7 @@ void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
template<typename Kernel, typename Input, typename Output> template<typename Kernel, typename Input, typename Output>
__global__ __global__
void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* out) void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
{ {
int i = threadIdx.x + blockIdx.x*blockDim.x; int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i<n) { if(i<n) {
@ -31,61 +40,70 @@ void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* o
template<typename Kernel, typename Input, typename Output> template<typename Kernel, typename Input, typename Output>
void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out) void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
{ {
typename Input::Scalar* d_in; typename Input::Scalar* d_in;
typename Output::Scalar* d_out; typename Output::Scalar* d_out;
std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar); std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar);
std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar); std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
cudaMalloc((void**)(&d_in), in_bytes); gpuMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out), out_bytes); gpuMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_out, out.data(), out_bytes, gpuMemcpyHostToDevice);
// Simple and non-optimal 1D mapping assuming n is not too large // Simple and non-optimal 1D mapping assuming n is not too large
// That's only for unit testing! // That's only for unit testing!
dim3 Blocks(128); dim3 Blocks(128);
dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) ); dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
cudaThreadSynchronize(); gpuDeviceSynchronize();
run_on_cuda_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
cudaThreadSynchronize(); #ifdef EIGEN_USE_HIP
hipLaunchKernelGGL(run_on_gpu_meta_kernel<Kernel,
typename std::decay<decltype(*d_in)>::type,
typename std::decay<decltype(*d_out)>::type>,
dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
#else
run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
#endif
gpuDeviceSynchronize();
// check inputs have not been modified // check inputs have not been modified
cudaMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, cudaMemcpyDeviceToHost); gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost);
cudaFree(d_in); gpuFree(d_in);
cudaFree(d_out); gpuFree(d_out);
} }
template<typename Kernel, typename Input, typename Output> template<typename Kernel, typename Input, typename Output>
void run_and_compare_to_cuda(const Kernel& ker, int n, const Input& in, Output& out) void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& out)
{ {
Input in_ref, in_cuda; Input in_ref, in_gpu;
Output out_ref, out_cuda; Output out_ref, out_gpu;
#ifndef __CUDA_ARCH__ #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
in_ref = in_cuda = in; in_ref = in_gpu = in;
out_ref = out_cuda = out; out_ref = out_gpu = out;
#endif #endif
run_on_cpu (ker, n, in_ref, out_ref); run_on_cpu (ker, n, in_ref, out_ref);
run_on_cuda(ker, n, in_cuda, out_cuda); run_on_gpu(ker, n, in_gpu, out_gpu);
#ifndef __CUDA_ARCH__ #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
VERIFY_IS_APPROX(in_ref, in_cuda); VERIFY_IS_APPROX(in_ref, in_gpu);
VERIFY_IS_APPROX(out_ref, out_cuda); VERIFY_IS_APPROX(out_ref, out_gpu);
#endif #endif
} }
void ei_test_init_cuda() void ei_test_init_gpu()
{ {
int device = 0; int device = 0;
cudaDeviceProp deviceProp; gpuDeviceProp_t deviceProp;
cudaGetDeviceProperties(&deviceProp, device); gpuGetDeviceProperties(&deviceProp, device);
std::cout << "CUDA device info:\n"; std::cout << "GPU device info:\n";
std::cout << " name: " << deviceProp.name << "\n"; std::cout << " name: " << deviceProp.name << "\n";
std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n"; std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n";
std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n"; std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n";
@ -98,4 +116,4 @@ void ei_test_init_cuda()
std::cout << " computeMode: " << deviceProp.computeMode << "\n"; std::cout << " computeMode: " << deviceProp.computeMode << "\n";
} }
#endif // EIGEN_TEST_CUDA_COMMON_H #endif // EIGEN_TEST_GPU_COMMON_H
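The only launch-syntax difference run_on_gpu has to absorb is isolated in the sketch below. The kernel and wrapper are hypothetical; the gpu* aliases are the ones provided by TensorGpuHipCudaDefines.h, which this header now includes.

__global__ void scale_kernel_sketch(float* data, int n, float s) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) data[i] *= s;
}

inline void launch_scale_sketch(float* d_data, int n, float s) {
  dim3 blocks(128);
  dim3 grids((n + int(blocks.x) - 1) / int(blocks.x));
#ifdef EIGEN_USE_HIP
  // HIP: the kernel, grid, block, shared-memory bytes and stream are ordinary arguments.
  hipLaunchKernelGGL(scale_kernel_sketch, grids, blocks, 0, 0, d_data, n, s);
#else
  // CUDA keeps the <<<grid, block>>> launch syntax.
  scale_kernel_sketch<<<grids, blocks>>>(d_data, n, s);
#endif
  gpuDeviceSynchronize();
}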
@ -9,7 +9,7 @@
#include "main.h" #include "main.h"
#include <Eigen/src/Core/arch/CUDA/Half.h> #include <Eigen/src/Core/arch/GPU/Half.h>
// Make sure it's possible to forward declare Eigen::half // Make sure it's possible to forward declare Eigen::half
namespace Eigen { namespace Eigen {
@ -67,11 +67,27 @@
// protected by parenthesis against macro expansion, the min()/max() macros // protected by parenthesis against macro expansion, the min()/max() macros
// are defined here and any not-parenthesized min/max call will cause a // are defined here and any not-parenthesized min/max call will cause a
// compiler error. // compiler error.
#define min(A,B) please_protect_your_min_with_parentheses #if !defined(__HIPCC__)
#define max(A,B) please_protect_your_max_with_parentheses //
#define isnan(X) please_protect_your_isnan_with_parentheses // HIP header files include the following files
#define isinf(X) please_protect_your_isinf_with_parentheses // <thread>
#define isfinite(X) please_protect_your_isfinite_with_parentheses // <regex>
// <unordered_map>
// which seem to contain not-parenthesized calls to "max"/"min", triggering the following check and causing the compile to fail
//
// Including those header files before the following macro definitions for "min" / "max" only partially resolves the issue,
// because other HIP header files also define "isnan" / "isinf" / "isfinite" functions, which are needed in other
// headers.
//
// So we simply disable this check for HIP.
//
#define min(A,B) please_protect_your_min_with_parentheses
#define max(A,B) please_protect_your_max_with_parentheses
#define isnan(X) please_protect_your_isnan_with_parentheses
#define isinf(X) please_protect_your_isinf_with_parentheses
#define isfinite(X) please_protect_your_isfinite_with_parentheses
#endif
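A one-function illustration of what the check above catches when it is active (i.e. outside HIPCC): an unparenthesized min/max call expands through the macro and fails to compile, while the parenthesized spelling is left alone.

#include <algorithm>
// "std::min(1, 2)" would expand through the min(A,B) macro and fail to compile;
// the extra parentheses below suppress macro expansion, which is the required spelling here.
inline int min_of_two_sketch(int a, int b) { return (std::min)(a, b); }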
#ifdef M_PI #ifdef M_PI
#undef M_PI #undef M_PI
#endif #endif
@ -154,7 +170,7 @@ namespace Eigen
#define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "") #define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "")
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) #if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
#define EIGEN_EXCEPTIONS #define EIGEN_EXCEPTIONS
#endif #endif
@ -233,7 +249,7 @@ namespace Eigen
} }
#endif //EIGEN_EXCEPTIONS #endif //EIGEN_EXCEPTIONS
#elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS #elif !defined(__CUDACC__) && !defined(__HIPCC__)// EIGEN_DEBUG_ASSERTS
// see bug 89. The copy_bool here is working around a bug in gcc <= 4.3 // see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
#define eigen_assert(a) \ #define eigen_assert(a) \
if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
@ -290,7 +306,7 @@ namespace Eigen
std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n"; std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
#endif #endif
#if !defined(__CUDACC__) #if !defined(__CUDACC__) && !defined(__HIPCC__)
#define EIGEN_USE_CUSTOM_ASSERT #define EIGEN_USE_CUSTOM_ASSERT
#endif #endif
@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t;
#endif #endif
#ifdef EIGEN_USE_GPU #ifdef EIGEN_USE_GPU
#include <iostream> #include <iostream>
#include <cuda_runtime.h> #if defined(EIGEN_USE_HIP)
#if __cplusplus >= 201103L #include <hip/hip_runtime.h>
#include <atomic> #else
#include <unistd.h> #include <cuda_runtime.h>
#endif #endif
#if __cplusplus >= 201103L
#include <atomic>
#include <unistd.h>
#endif
#endif #endif
#include "src/Tensor/TensorMacros.h" #include "src/Tensor/TensorMacros.h"
@ -95,7 +99,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h" #include "src/Tensor/TensorDeviceThreadPool.h"
#include "src/Tensor/TensorDeviceCuda.h" #include "src/Tensor/TensorDeviceGpu.h"
#include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h" #include "src/Tensor/TensorDimensionList.h"
@ -112,14 +116,14 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h" #include "src/Tensor/TensorReduction.h"
#include "src/Tensor/TensorReductionCuda.h" #include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h" #include "src/Tensor/TensorContractionThreadPool.h"
#include "src/Tensor/TensorContractionCuda.h" #include "src/Tensor/TensorContractionGpu.h"
#include "src/Tensor/TensorConversion.h" #include "src/Tensor/TensorConversion.h"
#include "src/Tensor/TensorConvolution.h" #include "src/Tensor/TensorConvolution.h"
#include "src/Tensor/TensorFFT.h" #include "src/Tensor/TensorFFT.h"
@ -448,7 +448,10 @@ struct TensorContractionEvaluatorBase
} }
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { #if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size; const Index rows = m_i_size;
const Index cols = m_k_size; const Index cols = m_k_size;
@ -489,7 +492,10 @@ struct TensorContractionEvaluatorBase
} }
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { #if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
void evalGemm(Scalar* buffer) const {
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
if (m_can_use_xsmm) { if (m_can_use_xsmm) {
evalGemmXSMM(buffer); evalGemmXSMM(buffer);
@ -28,7 +28,24 @@ class TensorContractionBlocking {
typedef typename LhsMapper::Scalar LhsScalar; typedef typename LhsMapper::Scalar LhsScalar;
typedef typename RhsMapper::Scalar RhsScalar; typedef typename RhsMapper::Scalar RhsScalar;
EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : /*
Adding EIGEN_DEVICE_FUNC unconditionally to the 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`,
which in turn requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`,
which in turn requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
(else HIPCC will error out).
However, adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
results in NVCC erroring out with the following error:
../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
*/
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n) kc_(k), mc_(m), nc_(n)
{ {
if (ShardingType == ShardByCol) { if (ShardingType == ShardByCol) {
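A sketch of how the compiler-conditional annotation described in the comment above could be condensed into a single macro. The macro is not introduced by the patch (which spells the #if out at each declaration site); it is shown only to make the pattern explicit.

// NOT part of the patch -- the patch writes the #if out at each declaration site.
// This merely condenses the pattern described in the comment above.
#if !defined(EIGEN_HIPCC)
#define EIGEN_DEVICE_FUNC_EXCEPT_HIP_SKETCH EIGEN_DEVICE_FUNC
#else
#define EIGEN_DEVICE_FUNC_EXCEPT_HIP_SKETCH
#endif
// Usage, mirroring the constructor above:
//   EIGEN_DEVICE_FUNC_EXCEPT_HIP_SKETCH
//   TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1);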
@ -9,10 +9,10 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
namespace Eigen { namespace Eigen {
@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
// the sum across all big k blocks of the product of little k block of index (x, y) // the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce // with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation. // the 8 threads over y by summation.
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
#else #else
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@ -503,7 +503,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
template<typename Scalar, typename Index, typename LhsMapper, template<typename Scalar, typename Index, typename LhsMapper,
typename RhsMapper, typename OutputMapper> typename RhsMapper, typename OutputMapper>
__global__ void __global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(512, 1)
#else
__launch_bounds__(512) __launch_bounds__(512)
#endif
EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output, const OutputMapper output,
const Index m_size, const Index n_size, const Index k_size) { const Index m_size, const Index n_size, const Index k_size) {
@ -542,7 +546,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
results[i].x = results[i].y = results[i].z = results[i].w = 0; results[i].x = results[i].y = results[i].z = results[i].w = 0;
} }
#define prefetch_lhs(reg, row, col) \ #define prefetch_lhs(reg, row, col) \
if (!CHECK_LHS_BOUNDARY) { \ if (!CHECK_LHS_BOUNDARY) { \
if (col < k_size) { \ if (col < k_size) { \
@ -563,12 +566,12 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
reg.x =lhs(row + 0, col); \ reg.x =lhs(row + 0, col); \
} \ } \
} \ } \
} \ } \
Index lhs_vert = base_m+threadIdx.x*4; Index lhs_vert = base_m+threadIdx.x*4;
for (Index k = 0; k < k_size; k += 16) { for (Index k = 0; k < k_size; k += 16) {
lhs_pf0 = internal::pset1<float4>(0); lhs_pf0 = internal::pset1<float4>(0);
rhs_pf0 = internal::pset1<float4>(0); rhs_pf0 = internal::pset1<float4>(0);
@ -618,7 +621,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
x1 = rhs_pf0.x; x1 = rhs_pf0.x;
x2 = rhs_pf0.z; x2 = rhs_pf0.z;
} }
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
x1 = __shfl_xor(x1, 4); x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4); x2 = __shfl_xor(x2, 4);
#else #else
@ -695,7 +698,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
#undef prefetch_lhs #undef prefetch_lhs
#undef add_vals #undef add_vals
Index horiz_base = threadIdx.y*4+base_n; Index horiz_base = threadIdx.y*4+base_n;
if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
@ -784,7 +787,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
results[i].x = results[i].y = results[i].z = results[i].w = 0; results[i].x = results[i].y = results[i].z = results[i].w = 0;
} }
Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
for (Index k = 0; k < k_size; k += 32) { for (Index k = 0; k < k_size; k += 32) {
lhs_pf0 = internal::pset1<float4>(0); lhs_pf0 = internal::pset1<float4>(0);
@ -1069,7 +1071,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
__syncthreads(); __syncthreads();
} // end loop over k } // end loop over k
__syncthreads(); __syncthreads();
Index horiz_base = (threadIdx.y/4)*8+base_n; Index horiz_base = (threadIdx.y/4)*8+base_n;
if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
@ -1134,7 +1135,11 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
template<typename Index, typename LhsMapper, template<typename Index, typename LhsMapper,
typename RhsMapper, typename OutputMapper> typename RhsMapper, typename OutputMapper>
__global__ void __global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(256, 1)
#else
__launch_bounds__(256) __launch_bounds__(256)
#endif
EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output, const OutputMapper output,
const Index m_size, const Index n_size, const Index k_size) { const Index m_size, const Index n_size, const Index k_size) {
@ -1177,7 +1182,11 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
template<typename Index, typename LhsMapper, template<typename Index, typename LhsMapper,
typename RhsMapper, typename OutputMapper> typename RhsMapper, typename OutputMapper>
__global__ void __global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(256, 1)
#else
__launch_bounds__(256) __launch_bounds__(256)
#endif
EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output, const OutputMapper output,
const Index m_size, const Index n_size, const Index k_size) { const Index m_size, const Index n_size, const Index k_size) {
@ -1323,7 +1332,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index n_blocks = (n + 63) / 64; const Index n_blocks = (n + 63) / 64;
const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 num_blocks(m_blocks, n_blocks, 1);
const dim3 block_size(8, 8, 8); const dim3 block_size(8, 8, 8);
LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
} }
}; };
@ -1334,13 +1343,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index n_blocks = (n + 63) / 64; const Index n_blocks = (n + 63) / 64;
const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 num_blocks(m_blocks, n_blocks, 1);
const dim3 block_size(16, 16, 1); const dim3 block_size(16, 16, 1);
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
} else { } else {
const Index m_blocks = (m + 127) / 128; const Index m_blocks = (m + 127) / 128;
const Index n_blocks = (n + 63) / 64; const Index n_blocks = (n + 63) / 64;
const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 num_blocks(m_blocks, n_blocks, 1);
const dim3 block_size(8, 32, 1); const dim3 block_size(8, 32, 1);
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
} }
} }
}; };
@ -1384,12 +1393,17 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
OutputMapper output(buffer, m); OutputMapper output(buffer, m);
setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); #if defined(EIGEN_USE_HIP)
setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
#else
setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif
LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device); LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
} }
}; };
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_USE_GPU and EIGEN_CUDACC #endif // EIGEN_USE_GPU and EIGEN_GPUCC
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
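A hedged sketch of the __launch_bounds__ guard used by the contraction kernels above, applied to a trivial hypothetical kernel. The patch passes an explicit second argument only under hipcc; my reading is that the occupancy hint is interpreted differently there, so treat that explanation as an assumption rather than something the patch states.

// Hypothetical kernel showing the launch-bounds guard used for the contraction kernels above.
template <typename Scalar>
__global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(256, 1)
#else
__launch_bounds__(256)
#endif
axpy_kernel_sketch(const Scalar* x, Scalar* y, Scalar a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = a * x[i] + y[i];
}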
@ -54,8 +54,8 @@ class IndexMapper {
} }
} }
array<Index, NumDims> cudaInputDimensions; array<Index, NumDims> gpuInputDimensions;
array<Index, NumDims> cudaOutputDimensions; array<Index, NumDims> gpuOutputDimensions;
array<Index, NumDims> tmp = dimensions; array<Index, NumDims> tmp = dimensions;
array<Index, NumDims> ordering; array<Index, NumDims> ordering;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@ -65,8 +65,8 @@ class IndexMapper {
const Index index = i + offset; const Index index = i + offset;
ordering[index] = indices[i]; ordering[index] = indices[i];
tmp[indices[i]] = -1; tmp[indices[i]] = -1;
cudaInputDimensions[index] = input_dims[indices[i]]; gpuInputDimensions[index] = input_dims[indices[i]];
cudaOutputDimensions[index] = dimensions[indices[i]]; gpuOutputDimensions[index] = dimensions[indices[i]];
} }
int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@ -75,8 +75,8 @@ class IndexMapper {
for (int i = 0; i < NumDims; ++i) { for (int i = 0; i < NumDims; ++i) {
if (tmp[i] >= 0) { if (tmp[i] >= 0) {
ordering[written] = i; ordering[written] = i;
cudaInputDimensions[written] = input_dims[i]; gpuInputDimensions[written] = input_dims[i];
cudaOutputDimensions[written] = dimensions[i]; gpuOutputDimensions[written] = dimensions[i];
++written; ++written;
} }
} }
@ -89,37 +89,37 @@ class IndexMapper {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) { for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) { if (i > NumKernelDims) {
m_cudaInputStrides[i] = m_gpuInputStrides[i] =
m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
m_cudaOutputStrides[i] = m_gpuOutputStrides[i] =
m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
} else { } else {
m_cudaInputStrides[i] = 1; m_gpuInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1; m_gpuOutputStrides[i] = 1;
} }
} }
} else { } else {
for (int i = NumDims - 1; i >= 0; --i) { for (int i = NumDims - 1; i >= 0; --i) {
if (static_cast<size_t>(i + 1) < offset) { if (static_cast<size_t>(i + 1) < offset) {
m_cudaInputStrides[i] = m_gpuInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
m_cudaOutputStrides[i] = m_gpuOutputStrides[i] =
m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
} else { } else {
m_cudaInputStrides[i] = 1; m_gpuInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1; m_gpuOutputStrides[i] = 1;
} }
} }
} }
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
Index inputIndex = 0; Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) { for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaInputStrides[d]; const Index idx = p / m_gpuInputStrides[d];
inputIndex += idx * m_inputStrides[d]; inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d]; p -= idx * m_gpuInputStrides[d];
} }
inputIndex += p * m_inputStrides[NumKernelDims]; inputIndex += p * m_inputStrides[NumKernelDims];
} else { } else {
@ -128,22 +128,22 @@ class IndexMapper {
limit = NumDims - NumKernelDims - 1; limit = NumDims - NumKernelDims - 1;
} }
for (int d = 0; d < limit; ++d) { for (int d = 0; d < limit; ++d) {
const Index idx = p / m_cudaInputStrides[d]; const Index idx = p / m_gpuInputStrides[d];
inputIndex += idx * m_inputStrides[d]; inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d]; p -= idx * m_gpuInputStrides[d];
} }
inputIndex += p * m_inputStrides[limit]; inputIndex += p * m_inputStrides[limit];
} }
return inputIndex; return inputIndex;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
Index outputIndex = 0; Index outputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) { for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaOutputStrides[d]; const Index idx = p / m_gpuOutputStrides[d];
outputIndex += idx * m_outputStrides[d]; outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d]; p -= idx * m_gpuOutputStrides[d];
} }
outputIndex += p * m_outputStrides[NumKernelDims]; outputIndex += p * m_outputStrides[NumKernelDims];
} else { } else {
@ -152,44 +152,44 @@ class IndexMapper {
limit = NumDims - NumKernelDims - 1; limit = NumDims - NumKernelDims - 1;
} }
for (int d = 0; d < limit; ++d) { for (int d = 0; d < limit; ++d) {
const Index idx = p / m_cudaOutputStrides[d]; const Index idx = p / m_gpuOutputStrides[d];
outputIndex += idx * m_outputStrides[d]; outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d]; p -= idx * m_gpuOutputStrides[d];
} }
outputIndex += p * m_outputStrides[limit]; outputIndex += p * m_outputStrides[limit];
} }
return outputIndex; return outputIndex;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
return i * m_inputStrides[offset]; return i * m_inputStrides[offset];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
return i * m_outputStrides[offset]; return i * m_outputStrides[offset];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
@ -197,7 +197,7 @@ class IndexMapper {
k * m_inputStrides[offset + 2]; k * m_inputStrides[offset + 2];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
@ -209,8 +209,8 @@ class IndexMapper {
static const int NumDims = internal::array_size<InputDims>::value; static const int NumDims = internal::array_size<InputDims>::value;
array<Index, NumDims> m_inputStrides; array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_cudaInputStrides; array<Index, NumDims> m_gpuInputStrides;
array<Index, NumDims> m_cudaOutputStrides; array<Index, NumDims> m_gpuOutputStrides;
}; };
@ -553,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// Use an optimized implementation of the evaluation code for GPUs whenever possible. // Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
template <int StaticKernelSize> template <int StaticKernelSize>
struct GetKernelSize { struct GetKernelSize {
@ -576,8 +576,12 @@ __global__ void EigenConvolutionKernel1D(
indexMapper, indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX, const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) { const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[]; extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX; const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
@ -588,18 +592,18 @@ __global__ void EigenConvolutionKernel1D(
for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
// Load inputs to shared memory // Load inputs to shared memory
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.y * num_x_input; const int plane_kernel_offset = threadIdx.y * num_x_input;
#pragma unroll #pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index); s[i + plane_kernel_offset] = eval.coeff(tensor_index);
} }
__syncthreads(); __syncthreads();
// Compute the convolution // Compute the convolution
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll #pragma unroll
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
@ -609,7 +613,7 @@ __global__ void EigenConvolutionKernel1D(
for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
result += s[k + kernel_offset] * kernel[k]; result += s[k + kernel_offset] * kernel[k];
} }
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
buffer[tensor_index] = result; buffer[tensor_index] = result;
} }
__syncthreads(); __syncthreads();
@ -625,7 +629,11 @@ __global__ void EigenConvolutionKernel2D(
const float* __restrict kernel, const int numPlanes, const int numX, const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int numY, const int maxY, const int kernelSizeX, const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) { const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[]; extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX; const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@ -642,7 +650,7 @@ __global__ void EigenConvolutionKernel2D(
for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.z * num_y_input; const int plane_kernel_offset = threadIdx.z * num_y_input;
// Load inputs to shared memory // Load inputs to shared memory
@ -651,7 +659,7 @@ __global__ void EigenConvolutionKernel2D(
const int input_offset = num_x_input * (j + plane_kernel_offset); const int input_offset = num_x_input * (j + plane_kernel_offset);
#pragma unroll #pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
s[i + input_offset] = eval.coeff(tensor_index); s[i + input_offset] = eval.coeff(tensor_index);
} }
} }
@ -659,7 +667,7 @@ __global__ void EigenConvolutionKernel2D(
__syncthreads(); __syncthreads();
// Convolution // Convolution
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll #pragma unroll
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@ -675,7 +683,7 @@ __global__ void EigenConvolutionKernel2D(
result += s[k + input_offset] * kernel[k + kernel_offset]; result += s[k + input_offset] * kernel[k + kernel_offset];
} }
} }
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
buffer[tensor_index] = result; buffer[tensor_index] = result;
} }
} }
@ -693,7 +701,11 @@ __global__ void EigenConvolutionKernel3D(
const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) { const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[]; extern __shared__ float s[];
#endif
// Load inputs to shared memory // Load inputs to shared memory
const int first_x = blockIdx.x * maxX; const int first_x = blockIdx.x * maxX;
@ -710,13 +722,13 @@ __global__ void EigenConvolutionKernel3D(
for (int p = 0; p < numPlanes; ++p) { for (int p = 0; p < numPlanes; ++p) {
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0; const int plane_kernel_offset = 0;
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
} }
} }
@ -728,7 +740,7 @@ __global__ void EigenConvolutionKernel3D(
const int num_z_output = last_z - first_z + 1; const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1; const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1; const int num_x_output = last_x - first_x + 1;
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@ -741,7 +753,7 @@ __global__ void EigenConvolutionKernel3D(
} }
} }
} }
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
buffer[tensor_index] = result; buffer[tensor_index] = result;
} }
} }
@ -854,9 +866,9 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
const int maxSharedMem = m_device.sharedMemPerBlock(); const int maxSharedMem = m_device.sharedMemPerBlock();
const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
const int warpSize = 32; const int warpSize = 32;
switch (NumKernelDims) { switch (NumKernelDims) {
@ -908,15 +920,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_inputImpl.dimensions(), kernel_dims, indices); m_inputImpl.dimensions(), kernel_dims, indices);
switch(kernel_size) { switch(kernel_size) {
case 4: { case 4: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
break; break;
} }
case 7: { case 7: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
break; break;
} }
default: { default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
} }
} }
break; break;
@ -969,11 +981,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
case 4: { case 4: {
switch (kernel_size_y) { switch (kernel_size_y) {
case 7: { case 7: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
break; break;
} }
default: { default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
break; break;
} }
} }
@ -982,18 +994,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
case 7: { case 7: {
switch (kernel_size_y) { switch (kernel_size_y) {
case 4: { case 4: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
break; break;
} }
default: { default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
break; break;
} }
} }
break; break;
} }
default: { default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
break; break;
} }
} }
@ -1039,7 +1051,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices); m_inputImpl.dimensions(), kernel_dims, indices);
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
break; break;
} }
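The three convolution kernels above now declare their dynamically sized shared memory through HIP_DYNAMIC_SHARED when compiled with hipcc, and through the usual extern __shared__ declaration otherwise. A minimal, self-contained sketch of that pattern (the kernel name and the block-sum it performs are illustrative, not part of this patch):

// Illustrative kernel using the same portable declaration of dynamic shared memory.
// It must be launched with blockDim.x * sizeof(float) bytes of dynamic shared memory.
__global__ void blockSumKernel(const float* in, float* out, int n) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, buf)      // hipcc's macro for dynamic shared memory
#else
  extern __shared__ float buf[];      // standard CUDA declaration
#endif
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  buf[threadIdx.x] = (i < n) ? in[i] : 0.0f;
  __syncthreads();
  if (threadIdx.x == 0) {
    float acc = 0.0f;
    for (int k = 0; k < blockDim.x; ++k) acc += buf[k];
    out[blockIdx.x] = acc;
  }
}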

View File

@ -35,9 +35,12 @@ struct DefaultDevice {
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
#ifndef EIGEN_CUDA_ARCH #if !defined(EIGEN_GPU_COMPILE_PHASE)
// Running on the host CPU // Running on the host CPU
return 1; return 1;
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
return 64;
#else #else
// Running on a CUDA device // Running on a CUDA device
return 32; return 32;
@ -45,9 +48,12 @@ struct DefaultDevice {
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) #if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
// Running on the host CPU // Running on the host CPU
return l1CacheSize(); return l1CacheSize();
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
return 48*1024; // FIXME : update this number for HIP
#else #else
// Running on a CUDA device, return the amount of shared memory available. // Running on a CUDA device, return the amount of shared memory available.
return 48*1024; return 48*1024;
@ -55,9 +61,12 @@ struct DefaultDevice {
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) #if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
// Running single threaded on the host CPU // Running single threaded on the host CPU
return l3CacheSize(); return l3CacheSize();
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
return firstLevelCacheSize(); // FIXME : update this number for HIP
#else #else
// Running on a CUDA device // Running on a CUDA device
return firstLevelCacheSize(); return firstLevelCacheSize();
@ -65,10 +74,14 @@ struct DefaultDevice {
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef EIGEN_CUDA_ARCH #if !defined(EIGEN_GPU_COMPILE_PHASE)
// Running single threaded on the host CPU // Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU // Should return an enum that encodes the ISA supported by the CPU
return 1; return 1;
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
// return 1 as major for HIP
return 1;
#else #else
// Running on a CUDA device // Running on a CUDA device
return EIGEN_CUDA_ARCH / 100; return EIGEN_CUDA_ARCH / 100;
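Throughout DefaultDevice the old EIGEN_CUDA_ARCH guard is replaced by EIGEN_GPU_COMPILE_PHASE, with an extra branch for the HIP device pass (AMD wavefronts are 64 threads wide, versus 32 for a CUDA warp). A condensed sketch of that three-way dispatch, using a hypothetical helper name:

// Hypothetical helper (not part of this patch) showing the dispatch pattern used
// by DefaultDevice::numThreads() and friends above.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int simd_group_width() {
#if !defined(EIGEN_GPU_COMPILE_PHASE)
  return 1;    // host compilation phase: a single CPU thread
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
  return 64;   // HIP device compilation phase: AMD wavefront size
#else
  return 32;   // CUDA device compilation phase: NVIDIA warp size
#endif
}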

View File

@ -7,21 +7,26 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
// This header file contains the defines for the gpu* macros, which will resolve to
// their equivalent hip* or cuda* versions depending on the compiler in use
// A separate header (included at the end of this file) will undefine them all
#include "TensorGpuHipCudaDefines.h"
namespace Eigen { namespace Eigen {
static const int kCudaScratchSize = 1024; static const int kGpuScratchSize = 1024;
// This defines an interface that GPUDevice can take to use // This defines an interface that GPUDevice can take to use
// CUDA streams underneath. // HIP / CUDA streams underneath.
class StreamInterface { class StreamInterface {
public: public:
virtual ~StreamInterface() {} virtual ~StreamInterface() {}
virtual const cudaStream_t& stream() const = 0; virtual const gpuStream_t& stream() const = 0;
virtual const cudaDeviceProp& deviceProperties() const = 0; virtual const gpuDeviceProp_t& deviceProperties() const = 0;
// Allocate memory on the actual device where the computation will run // Allocate memory on the actual device where the computation will run
virtual void* allocate(size_t num_bytes) const = 0; virtual void* allocate(size_t num_bytes) const = 0;
@ -37,7 +42,7 @@ class StreamInterface {
virtual unsigned int* semaphore() const = 0; virtual unsigned int* semaphore() const = 0;
}; };
static cudaDeviceProp* m_deviceProperties; static gpuDeviceProp_t* m_deviceProperties;
static bool m_devicePropInitialized = false; static bool m_devicePropInitialized = false;
static void initializeDeviceProp() { static void initializeDeviceProp() {
@ -58,23 +63,23 @@ static void initializeDeviceProp() {
#endif #endif
// We're the first thread to reach this point. // We're the first thread to reach this point.
int num_devices; int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices); gpuError_t status = gpuGetDeviceCount(&num_devices);
if (status != cudaSuccess) { if (status != gpuSuccess) {
std::cerr << "Failed to get the number of CUDA devices: " std::cerr << "Failed to get the number of GPU devices: "
<< cudaGetErrorString(status) << gpuGetErrorString(status)
<< std::endl; << std::endl;
assert(status == cudaSuccess); assert(status == gpuSuccess);
} }
m_deviceProperties = new cudaDeviceProp[num_devices]; m_deviceProperties = new gpuDeviceProp_t[num_devices];
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
status = cudaGetDeviceProperties(&m_deviceProperties[i], i); status = gpuGetDeviceProperties(&m_deviceProperties[i], i);
if (status != cudaSuccess) { if (status != gpuSuccess) {
std::cerr << "Failed to initialize CUDA device #" std::cerr << "Failed to initialize GPU device #"
<< i << i
<< ": " << ": "
<< cudaGetErrorString(status) << gpuGetErrorString(status)
<< std::endl; << std::endl;
assert(status == cudaSuccess); assert(status == gpuSuccess);
} }
} }
@ -94,87 +99,87 @@ static void initializeDeviceProp() {
} }
} }
static const cudaStream_t default_stream = cudaStreamDefault; static const gpuStream_t default_stream = gpuStreamDefault;
class CudaStreamDevice : public StreamInterface { class GpuStreamDevice : public StreamInterface {
public: public:
// Use the default stream on the current device // Use the default stream on the current device
CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
cudaGetDevice(&device_); gpuGetDevice(&device_);
initializeDeviceProp(); initializeDeviceProp();
} }
// Use the default stream on the specified device // Use the default stream on the specified device
CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
initializeDeviceProp(); initializeDeviceProp();
} }
// Use the specified stream. Note that it's the // Use the specified stream. Note that it's the
// caller's responsibility to ensure that the stream can run on // caller's responsibility to ensure that the stream can run on
// the specified device. If no device is specified the code // the specified device. If no device is specified the code
// assumes that the stream is associated to the current gpu device. // assumes that the stream is associated to the current gpu device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1) GpuStreamDevice(const gpuStream_t* stream, int device = -1)
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) { if (device < 0) {
cudaGetDevice(&device_); gpuGetDevice(&device_);
} else { } else {
int num_devices; int num_devices;
cudaError_t err = cudaGetDeviceCount(&num_devices); gpuError_t err = gpuGetDeviceCount(&num_devices);
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
assert(device < num_devices); assert(device < num_devices);
device_ = device; device_ = device;
} }
initializeDeviceProp(); initializeDeviceProp();
} }
virtual ~CudaStreamDevice() { virtual ~GpuStreamDevice() {
if (scratch_) { if (scratch_) {
deallocate(scratch_); deallocate(scratch_);
} }
} }
const cudaStream_t& stream() const { return *stream_; } const gpuStream_t& stream() const { return *stream_; }
const cudaDeviceProp& deviceProperties() const { const gpuDeviceProp_t& deviceProperties() const {
return m_deviceProperties[device_]; return m_deviceProperties[device_];
} }
virtual void* allocate(size_t num_bytes) const { virtual void* allocate(size_t num_bytes) const {
cudaError_t err = cudaSetDevice(device_); gpuError_t err = gpuSetDevice(device_);
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
void* result; void* result;
err = cudaMalloc(&result, num_bytes); err = gpuMalloc(&result, num_bytes);
assert(err == cudaSuccess); assert(err == gpuSuccess);
assert(result != NULL); assert(result != NULL);
return result; return result;
} }
virtual void deallocate(void* buffer) const { virtual void deallocate(void* buffer) const {
cudaError_t err = cudaSetDevice(device_); gpuError_t err = gpuSetDevice(device_);
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
assert(buffer != NULL); assert(buffer != NULL);
err = cudaFree(buffer); err = gpuFree(buffer);
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
virtual void* scratchpad() const { virtual void* scratchpad() const {
if (scratch_ == NULL) { if (scratch_ == NULL) {
scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
} }
return scratch_; return scratch_;
} }
virtual unsigned int* semaphore() const { virtual unsigned int* semaphore() const {
if (semaphore_ == NULL) { if (semaphore_ == NULL) {
char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize; char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(scratch); semaphore_ = reinterpret_cast<unsigned int*>(scratch);
cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
return semaphore_; return semaphore_;
} }
private: private:
const cudaStream_t* stream_; const gpuStream_t* stream_;
int device_; int device_;
mutable void* scratch_; mutable void* scratch_;
mutable unsigned int* semaphore_; mutable unsigned int* semaphore_;
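For context, the renamed GpuStreamDevice is normally wrapped in a GpuDevice and handed to a tensor expression. A minimal usage sketch, assuming the code is compiled with nvcc or hipcc; the sizes and the expression itself are illustrative:

#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

void scale_on_gpu() {
  Eigen::GpuStreamDevice stream;           // default stream on the current device
  Eigen::GpuDevice gpu_device(&stream);    // device wrapper used by the tensor evaluators

  const int n = 1024;
  float* d_in  = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
  float* d_out = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));

  Eigen::TensorMap<Eigen::Tensor<float, 1> > in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, n);
  out.device(gpu_device) = in * 2.0f;      // evaluated by a GPU kernel launch

  gpu_device.synchronize();
  gpu_device.deallocate(d_in);
  gpu_device.deallocate(d_out);
}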
@ -190,7 +195,7 @@ struct GpuDevice {
eigen_assert(stream); eigen_assert(stream);
} }
// TODO(bsteiner): This is an internal API, we should not expose it. // TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const { EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
return stream_->stream(); return stream_->stream();
} }
@ -211,11 +216,11 @@ struct GpuDevice {
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_CUDA_ARCH #ifndef EIGEN_GPU_COMPILE_PHASE
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
stream_->stream()); stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
#else #else
EIGEN_UNUSED_VARIABLE(dst); EIGEN_UNUSED_VARIABLE(dst);
EIGEN_UNUSED_VARIABLE(src); EIGEN_UNUSED_VARIABLE(src);
@ -225,24 +230,24 @@ struct GpuDevice {
} }
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
cudaError_t err = gpuError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
cudaError_t err = gpuError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_CUDA_ARCH #ifndef EIGEN_GPU_COMPILE_PHASE
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
#else #else
eigen_assert(false && "The default device should be used instead to generate kernel code"); eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif #endif
@ -260,31 +265,31 @@ struct GpuDevice {
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and // We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on cuda devices. // there is no l3 cache on hip/cuda devices.
return firstLevelCacheSize(); return firstLevelCacheSize();
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH) #if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
cudaError_t err = cudaStreamSynchronize(stream_->stream()); gpuError_t err = gpuStreamSynchronize(stream_->stream());
if (err != cudaSuccess) { if (err != gpuSuccess) {
std::cerr << "Error detected in CUDA stream: " std::cerr << "Error detected in GPU stream: "
<< cudaGetErrorString(err) << gpuGetErrorString(err)
<< std::endl; << std::endl;
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
#else #else
assert(false && "The default device should be used instead to generate kernel code"); assert(false && "The default device should be used instead to generate kernel code");
#endif #endif
} }
EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
return stream_->deviceProperties().multiProcessorCount; return stream_->deviceProperties().multiProcessorCount;
} }
EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
return stream_->deviceProperties().maxThreadsPerBlock; return stream_->deviceProperties().maxThreadsPerBlock;
} }
EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
return stream_->deviceProperties().maxThreadsPerMultiProcessor; return stream_->deviceProperties().maxThreadsPerMultiProcessor;
} }
EIGEN_STRONG_INLINE int sharedMemPerBlock() const { EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
@ -301,12 +306,12 @@ struct GpuDevice {
return max_blocks_; return max_blocks_;
} }
// This function checks if the CUDA runtime recorded an error for the // This function checks if the GPU runtime recorded an error for the
// underlying stream device. // underlying stream device.
inline bool ok() const { inline bool ok() const {
#ifdef EIGEN_CUDACC #ifdef EIGEN_GPUCC
cudaError_t error = cudaStreamQuery(stream_->stream()); gpuError_t error = gpuStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady); return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else #else
return false; return false;
#endif #endif
@ -317,18 +322,27 @@ struct GpuDevice {
int max_blocks_; int max_blocks_;
}; };
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ #if defined(EIGEN_HIPCC)
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
assert(hipGetLastError() == hipSuccess);
#else
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess); assert(cudaGetLastError() == cudaSuccess);
#endif
// FIXME: Should be device and kernel specific. // FIXME: Should be device and kernel specific.
#ifdef EIGEN_CUDACC #ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_CUDA_ARCH #ifndef EIGEN_GPU_COMPILE_PHASE
cudaError_t status = cudaDeviceSetSharedMemConfig(config); gpuError_t status = gpuDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status) EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess); assert(status == gpuSuccess);
#else #else
EIGEN_UNUSED_VARIABLE(config) EIGEN_UNUSED_VARIABLE(config)
#endif #endif
@ -337,4 +351,7 @@ static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H // undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
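The LAUNCH_GPU_KERNEL macro above hides the syntactic difference between the two toolchains. Roughly, a call site such as the following (kernel name and arguments are made up for illustration) expands as shown in the comments:

// Illustrative call site; MyKernel, d_ptr and n are not part of this patch.
LAUNCH_GPU_KERNEL((MyKernel<float>), num_blocks, block_size, shared_mem, gpu_device, d_ptr, n);

// Under hipcc this expands (roughly) to:
//   hipLaunchKernelGGL(MyKernel<float>, dim3(num_blocks), dim3(block_size),
//                      shared_mem, gpu_device.stream(), d_ptr, n);
//   assert(hipGetLastError() == hipSuccess);
//
// and under nvcc to the familiar triple-chevron launch:
//   MyKernel<float><<<num_blocks, block_size, shared_mem, gpu_device.stream()>>>(d_ptr, n);
//   assert(cudaGetLastError() == cudaSuccess);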

View File

@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
}; };
#if defined(EIGEN_CUDACC) #if defined(EIGEN_GPUCC)
template <typename Evaluator, typename Index, bool Vectorizable> template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval { struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE static __device__ EIGEN_ALWAYS_INLINE
@ -250,21 +250,22 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) { if (needs_assign) {
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() * const int block_size = device.maxGpuThreadsPerBlock();
device.maxCudaThreadsPerMultiProcessor() / block_size; const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions()); const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0. // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
LAUNCH_CUDA_KERNEL( LAUNCH_GPU_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size); num_blocks, block_size, 0, device, evaluator, size);
} }
evaluator.cleanup(); evaluator.cleanup();
} }
#endif // EIGEN_CUDACC #endif // EIGEN_GPUCC
#endif // EIGEN_USE_GPU #endif // EIGEN_USE_GPU
// SYCL Executor policy // SYCL Executor policy
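The launch configuration computed just above relies on the renamed device queries; a small, self-contained helper that reproduces the same arithmetic (the helper itself and the example numbers are illustrative):

// Reproduces the block-count logic of the GPU TensorExecutor above. With, say, 80
// multiprocessors, 2048 threads per multiprocessor, 1024 threads per block and
// 1,000,000 coefficients it returns min(160, divup(1000000, 1024) = 977) = 160,
// and never less than 1 block.
inline int launch_block_count(int num_multiprocessors, int max_threads_per_mp,
                              int block_size, long long num_coeffs) {
  const int max_blocks = num_multiprocessors * max_threads_per_mp / block_size;
  const int needed     = static_cast<int>((num_coeffs + block_size - 1) / block_size);
  const int capped     = needed < max_blocks ? needed : max_blocks;
  return capped > 0 ? capped : 1;   // at least one block, even for size-0 tensors
}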

View File

@ -109,7 +109,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { #if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
const Index numValues = internal::array_prod(m_impl.dimensions()); const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types. // Should initialize the memory in case we're dealing with non POD types.
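The #if !defined(EIGEN_HIPCC) guard around EIGEN_DEVICE_FUNC here (and again in TensorReduction.h further below) keeps hipcc from demanding device annotations on everything evalSubExprsIfNeeded calls. If one wanted to avoid repeating the guard, a hypothetical convenience macro (not introduced by this patch) could capture it:

// Hypothetical macro, shown only to illustrate the guard above: it expands to
// EIGEN_DEVICE_FUNC for every compiler except hipcc.
#if defined(EIGEN_HIPCC)
#define EIGEN_DEVICE_FUNC_EXCEPT_HIP
#else
#define EIGEN_DEVICE_FUNC_EXCEPT_HIP EIGEN_DEVICE_FUNC
#endif

// Equivalent declaration of the function above, using the hypothetical macro:
// EIGEN_DEVICE_FUNC_EXCEPT_HIP EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*);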

View File

@ -0,0 +1,87 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but
// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
// When compiling such files, gcc will end up trying to pick up the CUDA headers by
// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
// This will obviously not work when trying to compile tensorflow on a system with no CUDA
// To work around this issue for HIP systems (and leave the default behaviour intact), the
// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
// "unsupported/Eigen/CXX11/Tensor" has been updated to use the HIP header when EIGEN_USE_HIP is
// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well.
#if defined(EIGEN_USE_HIP)
#define gpuStream_t hipStream_t
#define gpuDeviceProp_t hipDeviceProp_t
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuErrorNotReady hipErrorNotReady
#define gpuGetDeviceCount hipGetDeviceCount
#define gpuGetErrorString hipGetErrorString
#define gpuGetDeviceProperties hipGetDeviceProperties
#define gpuStreamDefault hipStreamDefault
#define gpuGetDevice hipGetDevice
#define gpuSetDevice hipSetDevice
#define gpuMalloc hipMalloc
#define gpuFree hipFree
#define gpuMemsetAsync hipMemsetAsync
#define gpuMemcpyAsync hipMemcpyAsync
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuStreamQuery hipStreamQuery
#define gpuSharedMemConfig hipSharedMemConfig
#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
#define gpuStreamSynchronize hipStreamSynchronize
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuMemcpy hipMemcpy
#else
#define gpuStream_t cudaStream_t
#define gpuDeviceProp_t cudaDeviceProp
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuErrorNotReady cudaErrorNotReady
#define gpuGetDeviceCount cudaGetDeviceCount
#define gpuGetErrorString cudaGetErrorString
#define gpuGetDeviceProperties cudaGetDeviceProperties
#define gpuStreamDefault cudaStreamDefault
#define gpuGetDevice cudaGetDevice
#define gpuSetDevice cudaSetDevice
#define gpuMalloc cudaMalloc
#define gpuFree cudaFree
#define gpuMemsetAsync cudaMemsetAsync
#define gpuMemcpyAsync cudaMemcpyAsync
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuStreamQuery cudaStreamQuery
#define gpuSharedMemConfig cudaSharedMemConfig
#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
#define gpuStreamSynchronize cudaStreamSynchronize
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuMemcpy cudaMemcpy
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// HIPCC does not support the use of assert on the GPU side.
#undef assert
#define assert(COND)
#endif
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
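With these aliases in place, runtime-API code inside Eigen can be written once against the gpu* names and compiled for either back end. A minimal sketch of that style (a hypothetical helper, valid only between the defines and undefines includes):

// Hypothetical helper written against the gpu* aliases above: allocates device memory,
// clears it asynchronously, and waits for the stream. Compiles as HIP when EIGEN_USE_HIP
// is defined and as CUDA otherwise.
inline void* gpu_allocate_and_clear(size_t num_bytes, gpuStream_t stream) {
  void* ptr = NULL;
  gpuError_t err = gpuMalloc(&ptr, num_bytes);
  assert(err == gpuSuccess);
  err = gpuMemsetAsync(ptr, 0, num_bytes, stream);
  assert(err == gpuSuccess);
  err = gpuStreamSynchronize(stream);
  assert(err == gpuSuccess);
  return ptr;
}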

View File

@ -0,0 +1,40 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#undef gpuStream_t
#undef gpuDeviceProp_t
#undef gpuError_t
#undef gpuSuccess
#undef gpuErrorNotReady
#undef gpuGetDeviceCount
#undef gpuGetErrorString
#undef gpuGetDeviceProperties
#undef gpuStreamDefault
#undef gpuGetDevice
#undef gpuSetDevice
#undef gpuMalloc
#undef gpuFree
#undef gpuMemsetAsync
#undef gpuMemcpyAsync
#undef gpuMemcpyDeviceToDevice
#undef gpuMemcpyDeviceToHost
#undef gpuMemcpyHostToDevice
#undef gpuStreamQuery
#undef gpuSharedMemConfig
#undef gpuDeviceSetSharedMemConfig
#undef gpuStreamSynchronize
#undef gpuDeviceSynchronize
#undef gpuMemcpy
#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
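The undefines header exists so the gpu* aliases never leak past the translation unit that needed them; the intended include discipline, sketched:

// Sketch of the include pattern used by the Eigen GPU headers (e.g. TensorDeviceGpu.h above):
// the gpu* aliases are visible only between the two includes.
#include "TensorGpuHipCudaDefines.h"

// ... code that uses gpuStream_t, gpuMalloc, LAUNCH_GPU_KERNEL, etc. ...

#include "TensorGpuHipCudaUndefines.h"
// From here on the gpu* names are undefined again and cannot clash with other code.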

View File

@ -350,7 +350,8 @@ struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
namespace internal { namespace internal {
template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { template<typename FirstType, typename... OtherTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
size_t result = 1; size_t result = 1;
for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) { for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
result *= sizes[i]; result *= sizes[i];

View File

@ -35,7 +35,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val) typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{ {
#ifdef EIGEN_CUDA_ARCH #ifdef EIGEN_GPU_COMPILE_PHASE
return __clz(val); return __clz(val);
#elif defined(__SYCL_DEVICE_ONLY__) #elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val); return cl::sycl::clz(val);
@ -53,7 +53,7 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val) typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
{ {
#ifdef EIGEN_CUDA_ARCH #ifdef EIGEN_GPU_COMPILE_PHASE
return __clzll(val); return __clzll(val);
#elif defined(__SYCL_DEVICE_ONLY__) #elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::clz(val); return cl::sycl::clz(val);
@ -90,7 +90,7 @@ namespace {
template <typename T> template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
#if defined(EIGEN_CUDA_ARCH) #if defined(EIGEN_GPU_COMPILE_PHASE)
return __umulhi(a, b); return __umulhi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__) #elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b)); return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
@ -101,7 +101,7 @@ namespace {
template <typename T> template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(EIGEN_CUDA_ARCH) #if defined(EIGEN_GPU_COMPILE_PHASE)
return __umul64hi(a, b); return __umul64hi(a, b);
#elif defined(__SYCL_DEVICE_ONLY__) #elif defined(__SYCL_DEVICE_ONLY__)
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b)); return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
@ -124,7 +124,7 @@ namespace {
template <typename T> template <typename T>
struct DividerHelper<64, T> { struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) #if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else #else
const uint64_t shift = 1ULL << log_div; const uint64_t shift = 1ULL << log_div;
@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
} }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef EIGEN_CUDA_ARCH #ifdef EIGEN_GPU_COMPILE_PHASE
return (__umulhi(magic, n) >> shift); return (__umulhi(magic, n) >> shift);
#elif defined(__SYCL_DEVICE_ONLY__) #elif defined(__SYCL_DEVICE_ONLY__)
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift); return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
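TensorIntDiv.h now keys its device intrinsics on EIGEN_GPU_COMPILE_PHASE, so the same branch serves both the CUDA and the HIP device passes. A stripped-down sketch of that guard style; the host fallback shown here uses a GCC/Clang builtin and is illustrative rather than Eigen's actual implementation:

// Stripped-down version of the count_leading_zeros guard above. __clz is available in
// both nvcc and hipcc device compilation; the host branch is an illustrative fallback.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int leading_zeros32(unsigned int val) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __clz(val);
#elif defined(__SYCL_DEVICE_ONLY__)
  return cl::sycl::clz(val);
#else
  return val == 0 ? 32 : __builtin_clz(val);   // __builtin_clz(0) is undefined
#endif
}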

View File

@ -27,7 +27,7 @@
*/ */
// SFINAE requires variadic templates // SFINAE requires variadic templates
#ifndef EIGEN_CUDACC #if !defined(EIGEN_GPUCC)
#if EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7 // SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC #ifdef EIGEN_COMP_GNUC

View File

@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
}; };
// For CUDA packet types when using a GpuDevice // For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16) #if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
template <> template <>
struct PacketType<half, GpuDevice> { struct PacketType<half, GpuDevice> {
typedef half2 type; typedef half2 type;

View File

@ -858,8 +858,8 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
} }
return inputIndex; return inputIndex;
} }
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
#ifndef __SYCL_DEVICE_ONLY__ #ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value)); return numext::maxi(min, numext::mini(max,value));
#else #else

View File

@ -16,7 +16,7 @@ namespace internal {
namespace { namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() { EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#ifdef EIGEN_CUDA_ARCH #if defined(EIGEN_GPU_COMPILE_PHASE)
// We don't support 3d kernels since we currently only use 1 and // We don't support 3d kernels since we currently only use 1 and
// 2d kernels. // 2d kernels.
assert(threadIdx.z == 0); assert(threadIdx.z == 0);

View File

@ -334,12 +334,12 @@ struct OuterReducer {
}; };
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I> template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16 #if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I> template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); __global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I> template <int B, int N, typename S, typename R, typename I>
@ -495,7 +495,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) { EIGEN_STRONG_INLINE
#if !defined(EIGEN_HIPCC)
// Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same for all the functions
// being called within here, which then leads to a proliferation of EIGEN_DEVICE_FUNC markings, one
// of which will eventually result in an NVCC error
EIGEN_DEVICE_FUNC
#endif
bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
m_impl.evalSubExprsIfNeeded(NULL); m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible. // Use the FullReducer if possible.
@ -694,9 +701,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#ifdef EIGEN_USE_THREADS #ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard; template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif #endif
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16 #if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
@ -781,7 +788,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
Op m_reducer; Op m_reducer;
// For full reductions // For full reductions
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value; static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false; static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL) #elif defined(EIGEN_USE_SYCL)

View File

@ -7,23 +7,23 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
// Full reducers for GPU, don't vectorize for now // Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple cuda threads to safely accumulate at the same // Reducer function that enables multiple gpu threads to safely accumulate at the same
// output address. It basically reads the current value of the output variable, and // output address. It basically reads the current value of the output variable, and
// attempts to update it with the new value. If in the meantime another cuda thread // attempts to update it with the new value. If in the meantime another gpu thread
// updated the content of the output address it will try again. // updated the content of the output address it will try again.
template <typename T, typename R> template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
if (sizeof(T) == 4) if (sizeof(T) == 4)
{ {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output); unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@ -79,7 +79,7 @@ __device__ inline double atomicExchCustom(double* address, double val) {
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
} }
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <template <typename T> class R> template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) { __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output); unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@ -98,11 +98,11 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
} }
} }
} }
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <> template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
atomicAdd(output, accum); atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300 #else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device"); assert(0 && "Shouldn't be called on unsupported device");
@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index> typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output, unsigned int* semaphore) { typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
// Initialize the output value // Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) { if (gridDim.x == 1) {
@ -168,7 +168,16 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
#pragma unroll #pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC)
// use std::is_floating_point to determine the type of accum
// This is needed because when the type is double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
}
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else #else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
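For readers less familiar with the idiom, the offset loop above is the usual warp-level tree reduction. A minimal sketch for a plain sum, assuming CUDA 9+ shuffle intrinsics (illustrative, not part of the patch):

__device__ inline float warp_sum_sketch(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // each step folds in the value held by the lane `offset` positions away
    val += __shfl_down_sync(0xFFFFFFFFu, val, offset, warpSize);
  }
  return val;  // lane 0 ends up holding the warp-wide sum
}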
@ -182,6 +191,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
if (gridDim.x > 1 && threadIdx.x == 0) { if (gridDim.x > 1 && threadIdx.x == 0) {
// Let the last block reset the semaphore // Let the last block reset the semaphore
atomicInc(semaphore, gridDim.x + 1); atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
__threadfence_system();
#endif
} }
#else // EIGEN_CUDA_ARCH >= 300 #else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device"); assert(0 && "Shouldn't be called on unsupported device");
@ -189,7 +201,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
} }
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <typename Self, template <typename Self,
typename Reducer, typename Index> typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) { __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
@ -227,6 +239,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
// Initialize the output value if it wasn't initialized by the ReductionInitKernel // Initialize the output value if it wasn't initialized by the ReductionInitKernel
if (gridDim.x == 1) { if (gridDim.x == 1) {
if (first_index == 0) { if (first_index == 0) {
if (num_coeffs % 2 != 0) { if (num_coeffs % 2 != 0) {
@ -238,7 +251,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
} }
__syncthreads(); __syncthreads();
} }
half2 accum = reducer.template initializePacket<half2>(); half2 accum = reducer.template initializePacket<half2>();
const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2); const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
for (Index i = 0; i < max_iter; i += BlockSize) { for (Index i = 0; i < max_iter; i += BlockSize) {
@ -250,7 +263,13 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll #pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC)
// FIXME: remove this workaround once we have native half/half2 support for __shfl_down
union { int i; half2 h; } wka_in, wka_out;
wka_in.h = accum;
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
reducer.reducePacket(wka_out.h, &accum);
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
#else #else
int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize); int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
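The union punning above amounts to shuffling a half2 by moving its raw 32-bit payload. A sketch of the same idea wrapped as a helper, using the pre-CUDA-9 / HIP __shfl_down (the helper name is illustrative, not part of the patch):

__device__ inline half2 shfl_down_half2_sketch(half2 v, int offset, int width) {
  union { int i; half2 h; } in, out;         // half2 and int are both 32 bits wide
  in.h = v;
  out.i = __shfl_down(in.i, offset, width);  // shuffle the raw payload as an int
  return out.h;
}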
@ -280,7 +299,7 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
*output = tmp; *output = tmp;
} }
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher { struct FullReductionLauncher {
@ -298,6 +317,7 @@ struct FullReductionLauncher<
internal::is_same<double, OutputType>::value, internal::is_same<double, OutputType>::value,
void>::type> { void>::type> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index; typedef typename Self::Index Index;
const int block_size = 256; const int block_size = 256;
const int num_per_thread = 128; const int num_per_thread = 128;
@ -308,12 +328,12 @@ struct FullReductionLauncher<
semaphore = device.semaphore(); semaphore = device.semaphore();
} }
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
} }
}; };
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op> template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> { struct FullReductionLauncher<Self, Op, Eigen::half, false> {
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@ -334,20 +354,20 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
if (num_blocks > 1) { if (num_blocks > 1) {
// We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks. // won't be race conditions between multiple thread blocks.
LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
1, 1, 0, device, reducer, self, num_coeffs, scratch); 1, 1, 0, device, reducer, self, num_coeffs, scratch);
} }
LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch); num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
if (num_blocks > 1) { if (num_blocks > 1) {
LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>), LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
1, 1, 0, device, reducer, output, scratch); 1, 1, 0, device, reducer, output, scratch);
} }
} }
}; };
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable> template <typename Self, typename Op, bool Vectorizable>
@ -355,16 +375,16 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't properly support exotic types such as complex, // Unfortunately nvidia doesn't properly support exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple cases // so reduce the scope of the optimized version of the code to the simple cases
// of doubles, floats and half floats // of doubles, floats and half floats
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful && static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value || internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_CUDA_FP16 #else // EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful && static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value); internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename OutputType> template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@ -384,7 +404,7 @@ template <int NumPerThread, typename Self,
typename Reducer, typename Index> typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) { typename Self::CoeffReturnType* output) {
#if EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
typedef typename Self::CoeffReturnType Type; typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1); eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1); eigen_assert(blockDim.z == 1);
@ -437,7 +457,16 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
#pragma unroll #pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC)
// use std::is_floating_point to determine the type of reduced_val
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<Type>::value) {
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
}
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else #else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
@ -454,7 +483,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
#endif // EIGEN_CUDA_ARCH >= 300 #endif // EIGEN_CUDA_ARCH >= 300
} }
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <int NumPerThread, typename Self, template <int NumPerThread, typename Self,
typename Reducer, typename Index> typename Reducer, typename Index>
@ -531,7 +560,18 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll #pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC)
// FIXME: remove this workaround once we have native half/half2 support for __shfl_down
union { int i; half2 h; } wka_in, wka_out;
wka_in.h = reduced_val1;
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
reducer.reducePacket(wka_out.h, &reduced_val1);
wka_in.h = reduced_val2;
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
reducer.reducePacket(wka_out.h, &reduced_val2);
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
#else #else
@ -556,7 +596,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
} }
} }
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher { struct InnerReductionLauncher {
@ -581,30 +621,30 @@ struct InnerReductionLauncher<
const int block_size = 256; const int block_size = 256;
const int num_per_thread = 128; const int num_per_thread = 128;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size; device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) { if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there // We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks. // won't be race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024); const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024; device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>), LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(), num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output); num_preserved_vals, output);
} }
LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false; return false;
} }
}; };
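To make the launch-size arithmetic above concrete, here is a worked example with assumed (illustrative) device numbers rather than values queried from a real GpuDevice:

constexpr int example_num_coeffs     = 1 << 20;   // assumed number of coefficients to reduce
constexpr int example_block_size     = 256;
constexpr int example_num_per_thread = 128;
// divup(num_coeffs, block_size * num_per_thread) = divup(1048576, 32768) = 32
constexpr int example_dyn_blocks =
    (example_num_coeffs + example_block_size * example_num_per_thread - 1) /
    (example_block_size * example_num_per_thread);
// assume 20 multiprocessors with 2048 resident threads each: 20 * 2048 / 256 = 160
constexpr int example_max_blocks = 20 * 2048 / example_block_size;
// num_blocks = min(160, 32) = 32
constexpr int example_num_blocks =
    example_dyn_blocks < example_max_blocks ? example_dyn_blocks : example_max_blocks;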
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op> template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> { struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@ -627,28 +667,28 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
const int block_size = /*256*/128; const int block_size = /*256*/128;
const int num_per_thread = /*128*/64; const int num_per_thread = /*128*/64;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size; device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) { if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there // We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks. // won't be race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024); const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024; device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
1, 1, 0, device, reducer, self, num_preserved_vals, output); 1, 1, 0, device, reducer, self, num_preserved_vals, output);
} }
LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false; return false;
} }
}; };
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op> template <typename Self, typename Op>
@ -656,16 +696,16 @@ struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't properly support exotic types such as complex, // Unfortunately nvidia doesn't properly support exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple case // so reduce the scope of the optimized version of the code to the simple case
// of doubles, floats and half floats. // of doubles, floats and half floats.
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful && static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value || internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_CUDA_FP16 #else // EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful && static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value); internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename OutputType> template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
@ -723,7 +763,20 @@ struct OuterReducer<Self, Op, GpuDevice> {
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value); internal::is_same<typename Self::CoeffReturnType, double>::value);
template <typename Device, typename OutputType> template <typename Device, typename OutputType>
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { static
#if !defined(EIGEN_HIPCC)
// FIXME: leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
// (in the cxx11_tensor_reduction_gpu test)
//
// terminate called after throwing an instance of 'std::runtime_error'
// what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
//
// don't know why this happens (or why it is a runtime error instead of a compile-time error)
//
// this will be fixed by HIP PR#457
EIGEN_DEVICE_FUNC
#endif
bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce doubles or floats on a gpu device"); assert(false && "Should only be called to reduce doubles or floats on a gpu device");
return true; return true;
} }
@ -740,33 +793,33 @@ struct OuterReducer<Self, Op, GpuDevice> {
const int block_size = 256; const int block_size = 256;
const int num_per_thread = 16; const int num_per_thread = 16;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size; device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) { if (num_blocks > 1) {
// We initialize the outputs in the reduction kernel itself when we don't have to worry // We initialize the outputs in the reduction kernel itself when we don't have to worry
// about race conditions between multiple thread blocks. // about race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024); const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024; device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
num_blocks, 1024, 0, device, reducer.initialize(), num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output); num_preserved_vals, output);
} }
LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false; return false;
} }
}; };
#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__) #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

View File

@ -242,7 +242,7 @@ struct ScanLauncher {
} }
}; };
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
// GPU implementation of scan // GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in // TODO(ibab) This placeholder implementation performs multiple scans in
@ -278,10 +278,11 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
Index total_size = internal::array_prod(self.dimensions()); Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64; Index num_blocks = (total_size / self.size() + 63) / 64;
Index block_size = 64; Index block_size = 64;
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
} }
}; };
#endif // EIGEN_USE_GPU && EIGEN_CUDACC #endif // EIGEN_USE_GPU && (EIGEN_GPUCC)
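To make the launch shape above concrete, a small worked example with assumed sizes (not values taken from the test suite); the kernel is launched with one thread per independent scan:

constexpr long example_total_size = 100000;  // assumed total number of coefficients
constexpr long example_scan_size  = 100;     // assumed scan length (self.size())
constexpr long example_num_scans  = example_total_size / example_scan_size;  // 1000 scans
constexpr long example_num_blocks = (example_num_scans + 63) / 64;           // 16 blocks of 64 threads
static_assert(example_num_blocks * 64 >= example_num_scans,
              "enough threads are launched to give each scan its own thread");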
} // end namespace Eigen } // end namespace Eigen

View File

@ -268,7 +268,7 @@ template<
typename Reducer typename Reducer
> struct reduce<Reducer> > struct reduce<Reducer>
{ {
constexpr static inline int run() { return Reducer::Identity; } EIGEN_DEVICE_FUNC constexpr static inline int run() { return Reducer::Identity; }
}; };
template< template<
@ -276,7 +276,7 @@ template<
typename A typename A
> struct reduce<Reducer, A> > struct reduce<Reducer, A>
{ {
constexpr static inline A run(A a) { return a; } EIGEN_DEVICE_FUNC constexpr static inline A run(A a) { return a; }
}; };
template< template<
@ -285,7 +285,7 @@ template<
typename... Ts typename... Ts
> struct reduce<Reducer, A, Ts...> > struct reduce<Reducer, A, Ts...>
{ {
constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) { EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...)); return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
} }
}; };
@ -324,7 +324,7 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
// does... // does...
template<typename... Ts> template<typename... Ts>
constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts) EIGEN_DEVICE_FUNC constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
{ {
return reduce<product_op, Ts...>::run(ts...); return reduce<product_op, Ts...>::run(ts...);
} }
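A brief usage note: adding EIGEN_DEVICE_FUNC makes this compile-time reduction callable from device code as well. For example (a sketch, assuming the meta helpers are reachable through the usual CXX11 headers):

// arg_prod folds its arguments with product_op at compile time:
static_assert(Eigen::internal::arg_prod(2, 3, 4) == 24, "2 * 3 * 4");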

View File

@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here // The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead. // Moreover, CUDA doesn't support the STL containers, so we use our own instead.
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY) #if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen { namespace Eigen {
template <typename T, size_t n> class array { template <typename T, size_t n> class array {

View File

@ -53,8 +53,8 @@ namespace Eigen {
#include "src/SpecialFunctions/SpecialFunctionsFunctors.h" #include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h" #include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
#if defined EIGEN_VECTORIZE_CUDA #if defined EIGEN_VECTORIZE_GPU
#include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h" #include "src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h"
#endif #endif
namespace Eigen { namespace Eigen {

View File

@ -190,7 +190,7 @@ template <>
struct lgamma_impl<float> { struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) { static EIGEN_STRONG_INLINE float run(float x) {
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) #if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy; int dummy;
return ::lgammaf_r(x, &dummy); return ::lgammaf_r(x, &dummy);
#else #else
@ -203,7 +203,7 @@ template <>
struct lgamma_impl<double> { struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) { static EIGEN_STRONG_INLINE double run(double x) {
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) #if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy; int dummy;
return ::lgamma_r(x, &dummy); return ::lgamma_r(x, &dummy);
#else #else

View File

@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H #ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
#define EIGEN_CUDA_SPECIALFUNCTIONS_H #define EIGEN_GPU_SPECIALFUNCTIONS_H
namespace Eigen { namespace Eigen {
@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to // Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones // introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...) // we'll use on the host side (SSE, AVX, ...)
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plgamma<float4>(const float4& a) float4 plgamma<float4>(const float4& a)
@ -223,4 +223,4 @@ pi1e<double2>(const double2& x) {
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H #endif // EIGEN_GPU_SPECIALFUNCTIONS_H

View File

@ -275,26 +275,83 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(cxx11_tensor_complex_cuda) ei_add_test(cxx11_tensor_complex_gpu)
ei_add_test(cxx11_tensor_complex_cwise_ops_cuda) ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
ei_add_test(cxx11_tensor_reduction_cuda) ei_add_test(cxx11_tensor_reduction_gpu)
ei_add_test(cxx11_tensor_argmax_cuda) ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_cuda) ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_cuda) ei_add_test(cxx11_tensor_scan_gpu)
# Contractions require arch 3.0 or higher # Contractions require arch 3.0 or higher
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29) if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
ei_add_test(cxx11_tensor_device) ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_cuda) ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_cuda) ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_cuda) ei_add_test(cxx11_tensor_of_float16_gpu)
endif() endif()
# The random number generation code requires arch 3.5 or greater. # The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
ei_add_test(cxx11_tensor_random_cuda) ei_add_test(cxx11_tensor_random_gpu)
endif() endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif() endif()
# Add HIP specific tests
if (EIGEN_TEST_HIP)
set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
if (EXISTS ${HIP_PATH})
list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
find_package(HIP REQUIRED)
if (HIP_FOUND)
execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
if (${HIP_PLATFORM} STREQUAL "hcc")
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${HIP_PATH}/include)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
#
# complex datatype is not yet supported by HIP
# so leaving out those tests for now
#
# ei_add_test(cxx11_tensor_complex_gpu)
# ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
#
ei_add_test(cxx11_tensor_reduction_gpu)
ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
ei_add_test(cxx11_tensor_random_gpu)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
elseif (${HIP_PLATFORM} STREQUAL "nvcc")
message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
else ()
message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
endif()
endif(HIP_FOUND)
else ()
message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
endif()
endif(EIGEN_TEST_HIP)

View File

@ -9,16 +9,18 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_TEST_FUNC cxx11_tensor_gpu
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "main.h" #include "main.h"
#include <unsupported/Eigen/CXX11/Tensor> #include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor; using Eigen::Tensor;
template <int Layout> template <int Layout>
void test_cuda_simple_argmax() void test_gpu_simple_argmax()
{ {
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97)); Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1)); Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
@ -34,13 +36,13 @@ void test_cuda_simple_argmax()
double* d_in; double* d_in;
DenseIndex* d_out_max; DenseIndex* d_out_max;
DenseIndex* d_out_min; DenseIndex* d_out_min;
cudaMalloc((void**)(&d_in), in_bytes); gpuMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out_max), out_bytes); gpuMalloc((void**)(&d_out_max), out_bytes);
cudaMalloc((void**)(&d_out_min), out_bytes); gpuMalloc((void**)(&d_out_min), out_bytes);
cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97)); Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
@ -50,20 +52,20 @@ void test_cuda_simple_argmax()
gpu_out_max.device(gpu_device) = gpu_in.argmax(); gpu_out_max.device(gpu_device) = gpu_in.argmax();
gpu_out_min.device(gpu_device) = gpu_in.argmin(); gpu_out_min.device(gpu_device) = gpu_in.argmin();
assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
cudaFree(d_in); gpuFree(d_in);
cudaFree(d_out_max); gpuFree(d_out_max);
cudaFree(d_out_min); gpuFree(d_out_min);
} }
template <int DataLayout> template <int DataLayout>
void test_cuda_argmax_dim() void test_gpu_argmax_dim()
{ {
Tensor<float, 4, DataLayout> tensor(2,3,5,7); Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims; std::vector<int> dims;
@ -97,12 +99,12 @@ void test_cuda_argmax_dim()
float* d_in; float* d_in;
DenseIndex* d_out; DenseIndex* d_out;
cudaMalloc((void**)(&d_in), in_bytes); gpuMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out), out_bytes); gpuMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@ -110,8 +112,8 @@ void test_cuda_argmax_dim()
gpu_out.device(gpu_device) = gpu_in.argmax(dim); gpu_out.device(gpu_device) = gpu_in.argmax(dim);
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(), VERIFY_IS_EQUAL(tensor_arg.size(),
size_t(2*3*5*7 / tensor.dimension(dim))); size_t(2*3*5*7 / tensor.dimension(dim)));
@ -134,25 +136,25 @@ void test_cuda_argmax_dim()
} }
} }
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmax(dim); gpu_out.device(gpu_device) = gpu_in.argmax(dim);
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension // Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
} }
cudaFree(d_in); gpuFree(d_in);
cudaFree(d_out); gpuFree(d_out);
} }
} }
template <int DataLayout> template <int DataLayout>
void test_cuda_argmin_dim() void test_gpu_argmin_dim()
{ {
Tensor<float, 4, DataLayout> tensor(2,3,5,7); Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims; std::vector<int> dims;
@ -186,12 +188,12 @@ void test_cuda_argmin_dim()
float* d_in; float* d_in;
DenseIndex* d_out; DenseIndex* d_out;
cudaMalloc((void**)(&d_in), in_bytes); gpuMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out), out_bytes); gpuMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@ -199,8 +201,8 @@ void test_cuda_argmin_dim()
gpu_out.device(gpu_device) = gpu_in.argmin(dim); gpu_out.device(gpu_device) = gpu_in.argmin(dim);
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(), VERIFY_IS_EQUAL(tensor_arg.size(),
2*3*5*7 / tensor.dimension(dim)); 2*3*5*7 / tensor.dimension(dim));
@ -223,29 +225,29 @@ void test_cuda_argmin_dim()
} }
} }
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmin(dim); gpu_out.device(gpu_device) = gpu_in.argmin(dim);
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension // Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
} }
cudaFree(d_in); gpuFree(d_in);
cudaFree(d_out); gpuFree(d_out);
} }
} }
void test_cxx11_tensor_cuda() void test_cxx11_tensor_gpu()
{ {
CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
} }
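The gpu* wrappers and Eigen::GpuStreamDevice used throughout this test come from the newly added TensorGpuHipCudaDefines.h include; presumably that header maps each wrapper onto the matching CUDA or HIP runtime call, roughly along these lines (a sketch of the idea, not the actual header contents):

#if defined(EIGEN_USE_HIP)
  #include <hip/hip_runtime.h>
  #define gpuMalloc              hipMalloc
  #define gpuMemcpy              hipMemcpy
  #define gpuMemcpyAsync         hipMemcpyAsync
  #define gpuMemcpyHostToDevice  hipMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost  hipMemcpyDeviceToHost
  #define gpuStreamSynchronize   hipStreamSynchronize
  #define gpuFree                hipFree
  #define gpuSuccess             hipSuccess
#else
  #include <cuda_runtime.h>
  #define gpuMalloc              cudaMalloc
  #define gpuMemcpy              cudaMemcpy
  #define gpuMemcpyAsync         cudaMemcpyAsync
  #define gpuMemcpyHostToDevice  cudaMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost  cudaMemcpyDeviceToHost
  #define gpuStreamSynchronize   cudaStreamSynchronize
  #define gpuFree                cudaFree
  #define gpuSuccess             cudaSuccess
#endif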

View File

@ -9,7 +9,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda #define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_gpu
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
@ -18,8 +18,8 @@
using Eigen::Tensor; using Eigen::Tensor;
void test_cuda_conversion() { void test_gpu_conversion() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101; int num_elem = 101;
@ -72,8 +72,8 @@ void test_fallback_conversion() {
} }
void test_cxx11_tensor_cast_float16_cuda() void test_cxx11_tensor_cast_float16_gpu()
{ {
CALL_SUBTEST(test_cuda_conversion()); CALL_SUBTEST(test_gpu_conversion());
CALL_SUBTEST(test_fallback_conversion()); CALL_SUBTEST(test_fallback_conversion());
} }

View File

@ -28,7 +28,7 @@ void test_cuda_complex_cwise_ops() {
cudaMalloc((void**)(&d_in2), complex_bytes); cudaMalloc((void**)(&d_in2), complex_bytes);
cudaMalloc((void**)(&d_out), complex_bytes); cudaMalloc((void**)(&d_out), complex_bytes);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1( Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(

View File

@ -34,7 +34,7 @@ void test_cuda_nullary() {
cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1( Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@ -70,7 +70,7 @@ void test_cuda_nullary() {
static void test_cuda_sum_reductions() { static void test_cuda_sum_reductions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024); const int num_rows = internal::random<int>(1024, 5*1024);
@ -106,7 +106,7 @@ static void test_cuda_sum_reductions() {
static void test_cuda_mean_reductions() { static void test_cuda_mean_reductions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024); const int num_rows = internal::random<int>(1024, 5*1024);
@ -142,7 +142,7 @@ static void test_cuda_mean_reductions() {
static void test_cuda_product_reductions() { static void test_cuda_product_reductions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024); const int num_rows = internal::random<int>(1024, 5*1024);

View File

@ -10,19 +10,20 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_TEST_FUNC cxx11_tensor_gpu
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "main.h" #include "main.h"
#include <unsupported/Eigen/CXX11/Tensor> #include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor; using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair; typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout> template<int DataLayout>
void test_cuda_contraction(int m_size, int k_size, int n_size) void test_gpu_contraction(int m_size, int k_size, int n_size)
{ {
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is // with these dimensions, the output has 300 * 140 elements, which is
@ -45,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
float* d_t_right; float* d_t_right;
float* d_t_result; float* d_t_result;
cudaMalloc((void**)(&d_t_left), t_left_bytes); gpuMalloc((void**)(&d_t_left), t_left_bytes);
cudaMalloc((void**)(&d_t_right), t_right_bytes); gpuMalloc((void**)(&d_t_right), t_right_bytes);
cudaMalloc((void**)(&d_t_result), t_result_bytes); gpuMalloc((void**)(&d_t_result), t_result_bytes);
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@ -66,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims); t_result = t_left.contract(t_right, dims);
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) { for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue; continue;
@ -79,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
assert(false); assert(false);
} }
cudaFree((void*)d_t_left); gpuFree((void*)d_t_left);
cudaFree((void*)d_t_right); gpuFree((void*)d_t_right);
cudaFree((void*)d_t_result); gpuFree((void*)d_t_result);
} }
@ -109,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size)
float* d_t_right; float* d_t_right;
float* d_t_result; float* d_t_result;
cudaMalloc((void**)(&d_t_left), t_left_bytes); gpuMalloc((void**)(&d_t_left), t_left_bytes);
cudaMalloc((void**)(&d_t_right), t_right_bytes); gpuMalloc((void**)(&d_t_right), t_right_bytes);
cudaMalloc((void**)(&d_t_result), t_result_bytes); gpuMalloc((void**)(&d_t_result), t_result_bytes);
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@ -129,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims); t_result = t_left.contract(t_right, dims);
-  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
   if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
       !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
     std::cout << "mismatch detected: " << t_result()
@@ -137,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size)
     assert(false);
   }
-  cudaFree((void*)d_t_left);
-  cudaFree((void*)d_t_right);
-  cudaFree((void*)d_t_result);
+  gpuFree((void*)d_t_left);
+  gpuFree((void*)d_t_right);
+  gpuFree((void*)d_t_result);
 }
 template<int DataLayout>
-void test_cuda_contraction_m() {
+void test_gpu_contraction_m() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(k, 128, 128);
-    test_cuda_contraction<RowMajor>(k, 128, 128);
+    test_gpu_contraction<ColMajor>(k, 128, 128);
+    test_gpu_contraction<RowMajor>(k, 128, 128);
   }
 }
 template<int DataLayout>
-void test_cuda_contraction_k() {
+void test_gpu_contraction_k() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(128, k, 128);
-    test_cuda_contraction<RowMajor>(128, k, 128);
+    test_gpu_contraction<ColMajor>(128, k, 128);
+    test_gpu_contraction<RowMajor>(128, k, 128);
   }
 }
 template<int DataLayout>
-void test_cuda_contraction_n() {
+void test_gpu_contraction_n() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(128, 128, k);
-    test_cuda_contraction<RowMajor>(128, 128, k);
+    test_gpu_contraction<ColMajor>(128, 128, k);
+    test_gpu_contraction<RowMajor>(128, 128, k);
   }
 }
 template<int DataLayout>
-void test_cuda_contraction_sizes() {
+void test_gpu_contraction_sizes() {
   int m_sizes[] = { 31, 39, 63, 64, 65,
                     127, 129, 255, 257 , 511,
                     512, 513, 1023, 1024, 1025};
@@ -186,29 +187,32 @@ void test_cuda_contraction_sizes() {
   for (int i = 0; i < 15; i++) {
     for (int j = 0; j < 15; j++) {
       for (int k = 0; k < 17; k++) {
-        test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+        test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
       }
     }
   }
 }
-void test_cxx11_tensor_cuda()
+void test_cxx11_tensor_gpu()
 {
-  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
-  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
   CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
   CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
-  CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
-  CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
-  CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
-  CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
-  CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
-  CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
-  CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+  CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
+  CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
+  CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
+  CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
+  CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+  // disable these subtests for HIP
+  CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
+  CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
+#endif
 }
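Note: the mismatch check above accepts the GPU result when it is within 1e-4 of the CPU reference, either absolutely or via Eigen::internal::isApprox; the same criterion recurs in the other tests touched by this commit. A hypothetical helper that factors it out could look like the sketch below (gpu_result_matches is an illustrative name, not something the patch adds).

#include <cmath>
#include <Eigen/Core>

// Sketch only: mirrors the tolerance test used by these GPU tests.
// Accept either a small absolute difference or a fuzzy relative match.
static bool gpu_result_matches(float expected, float actual, float tol = 1e-4f) {
  return std::fabs(expected - actual) <= tol ||
         Eigen::internal::isApprox(expected, actual, tol);
}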
View File
@@ -16,6 +16,7 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
 using Eigen::Tensor;
 using Eigen::RowMajor;
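The newly included TensorGpuHipCudaDefines.h is what lets this test call gpu*-prefixed runtime entry points (gpuMalloc, gpuMemcpy, gpuSuccess, ...) regardless of the backend. Presumably it maps each alias onto the corresponding CUDA or HIP runtime symbol, roughly along the lines of the sketch below; the exact contents of the header are not shown in this excerpt, so treat this as an assumption.

// Illustrative sketch of the expected aliasing (not the actual header).
#if defined(EIGEN_USE_HIP)
  #define gpuMalloc             hipMalloc
  #define gpuFree               hipFree
  #define gpuMemcpy             hipMemcpy
  #define gpuMemcpyAsync        hipMemcpyAsync
  #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
  #define gpuStreamSynchronize  hipStreamSynchronize
  #define gpuSuccess            hipSuccess
#else
  #define gpuMalloc             cudaMalloc
  #define gpuFree               cudaFree
  #define gpuMemcpy             cudaMemcpy
  #define gpuMemcpyAsync        cudaMemcpyAsync
  #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
  #define gpuStreamSynchronize  cudaStreamSynchronize
  #define gpuSuccess            cudaSuccess
#endif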
@@ -66,22 +67,22 @@ struct CPUContext {
 // Context for evaluation on GPU
 struct GPUContext {
   GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
-    assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
+    assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
     float kernel_1d_val[] = {3.14f, 2.7f};
-    assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
-    assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
+    assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
+    assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
     float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
-    assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
-    assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
+    assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
+    assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
     float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
-    assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+    assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
   }
   ~GPUContext() {
-    assert(cudaFree(kernel_1d_) == cudaSuccess);
-    assert(cudaFree(kernel_2d_) == cudaSuccess);
-    assert(cudaFree(kernel_3d_) == cudaSuccess);
+    assert(gpuFree(kernel_1d_) == gpuSuccess);
+    assert(gpuFree(kernel_2d_) == gpuSuccess);
+    assert(gpuFree(kernel_3d_) == gpuSuccess);
   }
   const Eigen::GpuDevice& device() const { return gpu_device_; }
@@ -102,7 +103,7 @@ struct GPUContext {
   float* kernel_2d_;
   float* kernel_3d_;
-  Eigen::CudaStreamDevice stream_;
+  Eigen::GpuStreamDevice stream_;
   Eigen::GpuDevice gpu_device_;
 };
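With CudaStreamDevice replaced by GpuStreamDevice and all raw runtime calls going through the gpu* aliases, the host-side pattern used throughout these tests becomes backend-neutral. A minimal sketch of that pattern, assuming the aliases above and a translation unit compiled with nvcc or hipcc, is:

#define EIGEN_USE_GPU
#include <cstddef>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>

// Sketch only: allocate device buffers, evaluate a tensor expression on the
// generic GpuDevice, then copy the result back and release the buffers.
void sketch_square_on_device() {
  Eigen::Tensor<float, 1> in(64), out(64);
  in.setRandom();
  const std::size_t bytes = in.size() * sizeof(float);

  float* d_in;
  float* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);
  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_in(d_in, 64);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_out(d_out, 64);

  // The same expression runs on whichever backend Eigen was built for.
  gpu_out.device(gpu_device) = gpu_in * gpu_in;

  gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream());
  gpuStreamSynchronize(gpu_device.stream());

  gpuFree(d_in);
  gpuFree(d_out);
}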
@@ -281,12 +282,12 @@ void test_gpu() {
   float* d_in1;
   float* d_in2;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_in2), in2_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
@@ -294,7 +295,7 @@ void test_gpu() {
   GPUContext context(gpu_in1, gpu_in2, gpu_out);
   test_contextual_eval(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -304,7 +305,7 @@ void test_gpu() {
   }
   test_forced_contextual_eval(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -314,7 +315,7 @@ void test_gpu() {
   }
   test_compound_assignment(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -324,7 +325,7 @@ void test_gpu() {
   }
   test_contraction(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
       const float result = out(i,j,0);
@@ -339,8 +340,8 @@ void test_gpu() {
   }
   test_1d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -350,8 +351,8 @@ void test_gpu() {
   }
   test_2d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
@@ -363,9 +364,13 @@ void test_gpu() {
     }
   }
+#if !defined(EIGEN_USE_HIP)
+  // disable this test on the HIP platform
+  // 3D tensor convolutions seem to hang on the HIP platform
   test_3d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 39; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
@@ -378,6 +383,9 @@ void test_gpu() {
       }
     }
   }
+#endif
 }
View File
@@ -9,7 +9,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
+#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_gpu
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
@@ -20,8 +20,8 @@
 using Eigen::Tensor;
 template<typename>
-void test_cuda_numext() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_numext() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
@@ -57,11 +57,11 @@ void test_cuda_numext() {
 }
-#ifdef EIGEN_HAS_CUDA_FP16
+#ifdef EIGEN_HAS_GPU_FP16
 template<typename>
-void test_cuda_conversion() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
@@ -95,8 +95,8 @@ void test_cuda_conversion() {
 }
 template<typename>
-void test_cuda_unary() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_unary() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
@@ -132,8 +132,8 @@ void test_cuda_unary() {
 }
 template<typename>
-void test_cuda_elementwise() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_elementwise() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
@@ -174,8 +174,8 @@ void test_cuda_elementwise() {
 }
 template<typename>
-void test_cuda_trancendental() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_trancendental() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
@@ -268,8 +268,8 @@ void test_cuda_trancendental() {
 }
 template<typename>
-void test_cuda_contractions() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_contractions() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int rows = 23;
   int cols = 23;
@@ -319,12 +319,12 @@ void test_cuda_contractions() {
 }
 template<typename>
-void test_cuda_reductions(int size1, int size2, int redux) {
+void test_gpu_reductions(int size1, int size2, int redux) {
   std::cout << "Reducing " << size1 << " by " << size2
             << " tensor along dim " << redux << std::endl;
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = size1*size2;
   int result_size = (redux == 1 ? size1 : size2);
@@ -368,20 +368,20 @@ void test_cuda_reductions(int size1, int size2, int redux) {
 }
 template<typename>
-void test_cuda_reductions() {
-  test_cuda_reductions<void>(13, 13, 0);
-  test_cuda_reductions<void>(13, 13, 1);
-  test_cuda_reductions<void>(35, 36, 0);
-  test_cuda_reductions<void>(35, 36, 1);
-  test_cuda_reductions<void>(36, 35, 0);
-  test_cuda_reductions<void>(36, 35, 1);
+void test_gpu_reductions() {
+  test_gpu_reductions<void>(13, 13, 0);
+  test_gpu_reductions<void>(13, 13, 1);
+  test_gpu_reductions<void>(35, 36, 0);
+  test_gpu_reductions<void>(35, 36, 1);
+  test_gpu_reductions<void>(36, 35, 0);
+  test_gpu_reductions<void>(36, 35, 1);
 }
 template<typename>
-void test_cuda_full_reductions() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_full_reductions() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int size = 13;
   int num_elem = size*size;
@@ -429,9 +429,9 @@ void test_cuda_full_reductions() {
 }
 template<typename>
-void test_cuda_forced_evals() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_forced_evals() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
@@ -479,20 +479,20 @@ void test_cuda_forced_evals() {
 #endif
-void test_cxx11_tensor_of_float16_cuda()
+void test_cxx11_tensor_of_float16_gpu()
 {
-  CALL_SUBTEST_1(test_cuda_numext<void>());
-#ifdef EIGEN_HAS_CUDA_FP16
-  CALL_SUBTEST_1(test_cuda_conversion<void>());
-  CALL_SUBTEST_1(test_cuda_unary<void>());
-  CALL_SUBTEST_1(test_cuda_elementwise<void>());
-  CALL_SUBTEST_1(test_cuda_trancendental<void>());
-  CALL_SUBTEST_2(test_cuda_contractions<void>());
-  CALL_SUBTEST_3(test_cuda_reductions<void>());
-  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
-  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
+  CALL_SUBTEST_1(test_gpu_numext<void>());
+#ifdef EIGEN_HAS_GPU_FP16
+  CALL_SUBTEST_1(test_gpu_conversion<void>());
+  CALL_SUBTEST_1(test_gpu_unary<void>());
+  CALL_SUBTEST_1(test_gpu_elementwise<void>());
+  CALL_SUBTEST_1(test_gpu_trancendental<void>());
+  CALL_SUBTEST_2(test_gpu_contractions<void>());
+  CALL_SUBTEST_3(test_gpu_reductions<void>());
+  CALL_SUBTEST_4(test_gpu_full_reductions<void>());
+  CALL_SUBTEST_5(test_gpu_forced_evals<void>());
 #else
-  std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
+  std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
 #endif
 }
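The half-precision subtests are now gated on EIGEN_HAS_GPU_FP16 rather than EIGEN_HAS_CUDA_FP16. Presumably the new macro is a backend-neutral umbrella that is set whenever either backend advertises fp16 support, roughly as sketched below; the name EIGEN_HAS_HIP_FP16 used for the HIP side is an assumption here.

// Sketch of the assumed umbrella define (not taken verbatim from the patch).
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
  #define EIGEN_HAS_GPU_FP16
#endif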
View File
@@ -9,15 +9,16 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
+#define EIGEN_TEST_FUNC cxx11_tensor_random_gpu
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
-void test_cuda_random_uniform()
+void test_gpu_random_uniform()
 {
   Tensor<float, 2> out(72,97);
   out.setZero();
@@ -25,24 +26,24 @@ void test_cuda_random_uniform()
   std::size_t out_bytes = out.size() * sizeof(float);
   float* d_out;
-  cudaMalloc((void**)(&d_out), out_bytes);
-  Eigen::CudaStreamDevice stream;
+  gpuMalloc((void**)(&d_out), out_bytes);
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
   gpu_out.device(gpu_device) = gpu_out.random();
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
   // For now we just check this code doesn't crash.
   // TODO: come up with a valid test of randomness
 }
-void test_cuda_random_normal()
+void test_gpu_random_normal()
 {
   Tensor<float, 2> out(72,97);
   out.setZero();
@@ -50,9 +51,9 @@ void test_cuda_random_normal()
   std::size_t out_bytes = out.size() * sizeof(float);
   float* d_out;
-  cudaMalloc((void**)(&d_out), out_bytes);
-  Eigen::CudaStreamDevice stream;
+  gpuMalloc((void**)(&d_out), out_bytes);
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
@@ -60,8 +61,8 @@ void test_cuda_random_normal()
   Eigen::internal::NormalRandomGenerator<float> gen(true);
   gpu_out.device(gpu_device) = gpu_out.random(gen);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 }
 static void test_complex()
@@ -77,9 +78,9 @@ static void test_complex()
 }
-void test_cxx11_tensor_random_cuda()
+void test_cxx11_tensor_random_gpu()
 {
-  CALL_SUBTEST(test_cuda_random_uniform());
-  CALL_SUBTEST(test_cuda_random_normal());
+  CALL_SUBTEST(test_gpu_random_uniform());
+  CALL_SUBTEST(test_gpu_random_normal());
   CALL_SUBTEST(test_complex());
 }
View File
@@ -9,7 +9,7 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
+#define EIGEN_TEST_FUNC cxx11_tensor_reduction_gpu
 #define EIGEN_USE_GPU
 #include "main.h"
@@ -19,7 +19,7 @@
 template<typename Type, int DataLayout>
 static void test_full_reductions() {
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   const int num_rows = internal::random<int>(1024, 5*1024);
@@ -67,7 +67,7 @@ static void test_first_dim_reductions() {
   Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
   // Create device
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice dev(&stream);
   // Create data(T)
@@ -107,7 +107,7 @@ static void test_last_dim_reductions() {
   Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
   // Create device
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice dev(&stream);
   // Create data
@@ -134,7 +134,7 @@ static void test_last_dim_reductions() {
 }
-void test_cxx11_tensor_reduction_cuda() {
+void test_cxx11_tensor_reduction_gpu() {
   CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
   CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
   CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
View File
@@ -9,19 +9,20 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+#define EIGEN_TEST_FUNC cxx11_tensor_scan_gpu
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;
 template<int DataLayout>
-void test_cuda_cumsum(int m_size, int k_size, int n_size)
+void test_gpu_cumsum(int m_size, int k_size, int n_size)
 {
   std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
   Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
@@ -36,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
   float* d_t_input;
   float* d_t_result;
-  cudaMalloc((void**)(&d_t_input), t_input_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
-  cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
-  Eigen::CudaStreamDevice stream;
+  gpuMalloc((void**)(&d_t_input), t_input_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice);
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
@@ -52,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
   gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
   t_result = t_input.cumsum(1);
-  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
   for (DenseIndex i = 0; i < t_result.size(); i++) {
     if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
       continue;
@@ -65,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
     assert(false);
   }
-  cudaFree((void*)d_t_input);
-  cudaFree((void*)d_t_result);
+  gpuFree((void*)d_t_input);
+  gpuFree((void*)d_t_result);
 }
-void test_cxx11_tensor_scan_cuda()
+void test_cxx11_tensor_scan_gpu()
 {
-  CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
-  CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));
 }