Merged in deven-amd/eigen (pull request PR-402)
Adding support for using Eigen in HIP kernels.
commit da0c604078

Eigen/Core | 136
@ -22,6 +22,17 @@
|
||||
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
|
||||
// analogous to EIGEN_CUDACC, but for HIP
|
||||
#define EIGEN_HIPCC __HIPCC__
|
||||
#endif
|
||||
|
||||
// NVCC is not supported as the target platform for HIPCC
|
||||
// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
|
||||
#if defined(__NVCC__) && defined(__HIPCC__)
|
||||
#error "NVCC as the target platform for HIPCC is currently not supported."
|
||||
#endif
|
||||
|
||||
// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
|
||||
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
|
||||
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
|
||||
@ -32,8 +43,8 @@
|
||||
#endif
|
||||
|
||||
// Handle NVCC/CUDA/SYCL
|
||||
#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
|
||||
// Do not try asserts on CUDA and SYCL!
|
||||
#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC)
|
||||
// Do not try asserts on CUDA, HIP and SYCL!
|
||||
#ifndef EIGEN_NO_DEBUG
|
||||
#define EIGEN_NO_DEBUG
|
||||
#endif
|
||||
@ -71,6 +82,26 @@
|
||||
#define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#elif defined(EIGEN_HIPCC)
|
||||
// Do not try to vectorize on HIP
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
#endif
|
||||
|
||||
#define EIGEN_DEVICE_FUNC __host__ __device__
|
||||
// We need hip_runtime.h to ensure that the EIGEN_USING_STD_MATH macro
// works properly on the device side
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP)
|
||||
// analogous to EIGEN_CUDA_ARCH, but for HIP
|
||||
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
|
||||
// Note this check needs to come after we include hip_runtime.h since
|
||||
// hip_runtime.h includes hip_common.h which in turn has the define
|
||||
// for __HIP_DEVICE_COMPILE__
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define EIGEN_DEVICE_FUNC
|
||||
#endif
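Illustrative sketch (hypothetical helper, not part of this patch): EIGEN_DEVICE_FUNC marks a routine as callable from both host and device code. Under hipcc it expands to __host__ __device__ as defined above (nvcc gets the same expansion elsewhere in this header), and to nothing in plain host builds.
EIGEN_DEVICE_FUNC inline float clamp01(float x) {
  // usable inside HIP/CUDA kernels as well as in ordinary host code
  return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
}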
|
||||
@ -81,16 +112,71 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// When compiling CUDA device code with NVCC, pull in math functions from the
// global namespace. In host mode, and when compiling device code with clang, use the
// std versions.
|
||||
#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
|
||||
|
||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
//
// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
//
#define EIGEN_GPUCC
//
// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
//
// In most cases, the same tweaks are required in the Eigen code to enable it in both HIP and CUDA kernels.
// For those cases, the corresponding code should be guarded with
//    #if defined(EIGEN_GPUCC)
// instead of
//    #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
//
// For cases where the tweak is specific to HIP, the code should be guarded with
//    #if defined(EIGEN_HIPCC)
//
// For cases where the tweak is specific to CUDA, the code should be guarded with
//    #if defined(EIGEN_CUDACC)
//
#endif
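A minimal sketch of the guard pattern recommended above; the MY_* macro names are hypothetical and only illustrate where each guard applies.
#if defined(EIGEN_GPUCC)
#define MY_KERNEL_INLINE __forceinline__   // tweak shared by the CUDA and HIP compilers
#else
#define MY_KERNEL_INLINE inline
#endif

#if defined(EIGEN_HIPCC)
#define MY_WARP_SIZE 64                    // HIP-only tweak (AMD wavefront size)
#elif defined(EIGEN_CUDACC)
#define MY_WARP_SIZE 32                    // CUDA-only tweak
#endif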
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
//
// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
//
#define EIGEN_GPU_COMPILE_PHASE
//
// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
//  + one to compile the source for the "host" (i.e. the CPU)
//  + another to compile the source for the "device" (i.e. the GPU)
//
// Code that needs to be enabled only during either the "host" or the "device" compilation phase
// needs to be guarded with a macro that indicates the current compilation phase
//
// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
//
// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
// For those cases, the code should be guarded with
//    #if defined(EIGEN_GPU_COMPILE_PHASE)
// instead of
//    #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
//
// For cases where the tweak is specific to HIP, the code should be guarded with
//    #if defined(EIGEN_HIP_DEVICE_COMPILE)
//
// For cases where the tweak is specific to CUDA, the code should be guarded with
//    #if defined(EIGEN_CUDA_ARCH)
//
#endif
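A sketch of how the compilation-phase macro is typically consumed (hypothetical function, assuming the usual GPU math intrinsics are available in the device pass):
EIGEN_DEVICE_FUNC inline float reciprocal_sqrt(float x) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
  return ::rsqrtf(x);        // device pass: GPU intrinsic, valid for both CUDA and HIP
#else
  return 1.0f / ::sqrtf(x);  // host pass: plain libm
#endif
}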
|
||||
|
||||
|
||||
// When compiling CUDA device code with NVCC, or HIP device code with HIPCC,
// pull in math functions from the global namespace. In host mode, and when
// compiling device code with clang, use the std versions.
#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIPCC__))
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
#else
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
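Usage sketch (hypothetical generic helper): EIGEN_USING_STD_MATH pulls the right overload set into scope, so templated code can call the routine unqualified on both host and device.
template <typename T>
EIGEN_DEVICE_FUNC T my_hypot(const T& a, const T& b) {
  EIGEN_USING_STD_MATH(sqrt);
  return sqrt(a * a + b * b);   // unqualified call; the macro above selected ::sqrt or std::sqrt
}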
|
||||
|
||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
|
||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#define EIGEN_EXCEPTIONS
|
||||
#endif
|
||||
|
||||
@ -271,7 +357,7 @@
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_CUDACC
|
||||
#define EIGEN_VECTORIZE_CUDA
|
||||
#define EIGEN_VECTORIZE_GPU
|
||||
#include <vector_types.h>
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
@ -283,6 +369,27 @@
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#define EIGEN_VECTORIZE_GPU
|
||||
#include <hip/hip_vector_types.h>
|
||||
|
||||
#define EIGEN_HAS_HIP_FP16
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
#define HIP_PATCH_WITH_NEW_FP16 18215
|
||||
#if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16)
|
||||
#define EIGEN_HAS_OLD_HIP_FP16
|
||||
// Old HIP implementation does not have an explicit typedef for "half2"
|
||||
typedef __half2 half2;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
||||
#define EIGEN_HAS_GPU_FP16
|
||||
#endif
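Downstream code can now key half-precision support off the vendor-neutral macro rather than the CUDA-specific one; a sketch with a hypothetical feature flag:
#if defined(EIGEN_HAS_GPU_FP16)
#define MY_LIB_USE_HALF_PACKETS 1   // a GPU __half type is available (CUDA or HIP)
#else
#define MY_LIB_USE_HALF_PACKETS 0
#endif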
|
||||
|
||||
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
|
||||
#define EIGEN_HAS_OPENMP
|
||||
#endif
|
||||
@ -403,7 +510,6 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/util/IntegralConstant.h"
|
||||
#include "src/Core/util/SymbolicIndex.h"
|
||||
|
||||
|
||||
#include "src/Core/NumTraits.h"
|
||||
#include "src/Core/MathFunctions.h"
|
||||
#include "src/Core/GenericPacketMath.h"
|
||||
@ -447,13 +553,13 @@ using std::ptrdiff_t;
|
||||
#endif
|
||||
|
||||
// Half float support
|
||||
#include "src/Core/arch/CUDA/Half.h"
|
||||
#include "src/Core/arch/CUDA/PacketMathHalf.h"
|
||||
#include "src/Core/arch/CUDA/TypeCasting.h"
|
||||
#include "src/Core/arch/GPU/Half.h"
|
||||
#include "src/Core/arch/GPU/PacketMathHalf.h"
|
||||
#include "src/Core/arch/GPU/TypeCasting.h"
|
||||
|
||||
#if defined EIGEN_VECTORIZE_CUDA
|
||||
#include "src/Core/arch/CUDA/PacketMath.h"
|
||||
#include "src/Core/arch/CUDA/MathFunctions.h"
|
||||
#if defined EIGEN_VECTORIZE_GPU
|
||||
#include "src/Core/arch/GPU/PacketMath.h"
|
||||
#include "src/Core/arch/GPU/MathFunctions.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/arch/Default/Settings.h"
|
||||
|
@ -35,7 +35,7 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
|
||||
template<int Size, int MaxSize> struct product_size_category
|
||||
{
|
||||
enum {
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||
is_large = MaxSize == Dynamic ||
|
||||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
||||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
||||
|
@ -303,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
|
||||
/** \internal tries to do cache prefetching of \a addr */
|
||||
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||
{
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// do nothing
|
||||
#elif defined(EIGEN_CUDA_ARCH)
|
||||
#if defined(__LP64__)
|
||||
// 64-bit pointer operand constraint for inlined asm
|
||||
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
||||
@ -530,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
|
||||
***************************************************************************/
|
||||
|
||||
// Eigen+CUDA does not support complexes.
|
||||
#ifndef EIGEN_CUDACC
|
||||
#if !defined(EIGEN_GPUCC)
|
||||
|
||||
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
||||
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||
|
@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
|
||||
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
template<typename T>
|
||||
struct real_impl<std::complex<T> >
|
||||
{
|
||||
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
|
||||
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
template<typename T>
|
||||
struct imag_impl<std::complex<T> >
|
||||
{
|
||||
@ -260,7 +260,7 @@ struct conj_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
|
||||
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
template<typename T>
|
||||
struct conj_impl<std::complex<T> >
|
||||
{
|
||||
@ -435,7 +435,12 @@ struct round_retval
|
||||
struct arg_impl {
|
||||
static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// HIP does not seem to have a native device side implementation for the math routine "arg"
|
||||
using std::arg;
|
||||
#else
|
||||
EIGEN_USING_STD_MATH(arg);
|
||||
#endif
|
||||
return arg(x);
|
||||
}
|
||||
};
|
||||
@ -768,7 +773,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isfinite_impl(const T& x)
|
||||
{
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return (::isfinite)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isfinite;
|
||||
@ -783,7 +788,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isinf_impl(const T& x)
|
||||
{
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return (::isinf)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isinf;
|
||||
@ -798,7 +803,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isnan_impl(const T& x)
|
||||
{
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return (::isnan)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isnan;
|
||||
@ -864,7 +869,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
|
||||
|
||||
namespace numext {
|
||||
|
||||
#if (!defined(EIGEN_CUDACC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
||||
@ -977,7 +982,12 @@ template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
|
||||
{
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// no "fminl" on HIP yet
|
||||
return (x < y) ? x : y;
|
||||
#else
|
||||
return fminl(x, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
@ -1002,7 +1012,12 @@ template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
|
||||
{
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// no "fmaxl" on HIP yet
|
||||
return (x > y) ? x : y;
|
||||
#else
|
||||
return fmaxl(x, y);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -1099,7 +1114,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
|
||||
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log1p(const float &x) { return ::log1pf(x); }
|
||||
|
||||
@ -1157,7 +1172,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
|
||||
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float floor(const float &x) { return ::floorf(x); }
|
||||
|
||||
@ -1178,7 +1193,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
|
||||
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float ceil(const float &x) { return ::ceilf(x); }
|
||||
|
||||
@ -1236,7 +1251,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log(const float &x) { return ::logf(x); }
|
||||
|
||||
@ -1264,7 +1279,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
|
||||
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float abs(const float &x) { return ::fabsf(x); }
|
||||
|
||||
@ -1294,7 +1309,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
|
||||
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float exp(const float &x) { return ::expf(x); }
|
||||
|
||||
@ -1330,7 +1345,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
|
||||
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float expm1(const float &x) { return ::expm1f(x); }
|
||||
|
||||
@ -1350,7 +1365,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
|
||||
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cos(const float &x) { return ::cosf(x); }
|
||||
|
||||
@ -1370,7 +1385,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
|
||||
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sin(const float &x) { return ::sinf(x); }
|
||||
|
||||
@ -1390,7 +1405,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
|
||||
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tan(const float &x) { return ::tanf(x); }
|
||||
|
||||
@ -1421,7 +1436,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
|
||||
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float acos(const float &x) { return ::acosf(x); }
|
||||
|
||||
@ -1452,7 +1467,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
|
||||
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float asin(const float &x) { return ::asinf(x); }
|
||||
|
||||
@ -1483,7 +1498,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
|
||||
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float atan(const float &x) { return ::atanf(x); }
|
||||
|
||||
@ -1504,7 +1519,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
|
||||
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cosh(const float &x) { return ::coshf(x); }
|
||||
|
||||
@ -1524,7 +1539,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
|
||||
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sinh(const float &x) { return ::sinhf(x); }
|
||||
|
||||
@ -1542,12 +1557,12 @@ T tanh(const T &x) {
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
|
||||
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
|
||||
#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
|
||||
#elif (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(const float &x) { return ::tanhf(x); }
|
||||
|
||||
@ -1567,7 +1582,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
|
||||
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef EIGEN_CUDACC
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float fmod(const float& a, const float& b) {
|
||||
|
@ -885,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
||||
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_CUDACC
|
||||
#ifndef EIGEN_GPUCC
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
{
|
||||
@ -929,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
||||
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
||||
}
|
||||
|
||||
#ifndef EIGEN_CUDACC
|
||||
#ifndef EIGEN_GPUCC
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
{
|
||||
|
@ -26,15 +26,15 @@
|
||||
|
||||
|
||||
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
|
||||
// type Eigen::half (inheriting from CUDA's __half struct) with
|
||||
// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
|
||||
// operator overloads such that it behaves basically as an arithmetic
|
||||
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the like), but fast on GPUs.
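A host-side usage sketch of the type described above (assuming the header is reached through Eigen/Core; not part of the patch):
#include <Eigen/Core>

int main() {
  Eigen::half a(1.5f), b(0.25f);
  Eigen::half c = a * b + Eigen::half(2.0f);          // arithmetic through the operator overloads
  return (static_cast<float>(c) == 2.375f) ? 0 : 1;   // converts back to float for checking
}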
|
||||
|
||||
|
||||
#ifndef EIGEN_HALF_CUDA_H
|
||||
#define EIGEN_HALF_CUDA_H
|
||||
#ifndef EIGEN_HALF_GPU_H
|
||||
#define EIGEN_HALF_GPU_H
|
||||
|
||||
#if __cplusplus > 199711L
|
||||
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
|
||||
@ -49,16 +49,41 @@ struct half;
|
||||
|
||||
namespace half_impl {
|
||||
|
||||
#if !defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if !defined(EIGEN_HAS_GPU_FP16)
|
||||
// Make our own __half_raw definition that is similar to CUDA's.
|
||||
struct __half_raw {
|
||||
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
||||
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||
unsigned short x;
|
||||
};
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#elif defined(EIGEN_HAS_HIP_FP16)
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
// Make a __half_raw definition that is
// ++ compatible with that of Eigen and
// ++ adds an implicit conversion to the native __half of the old HIP implementation.
//
// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementations.
//
// In the old HIP implementation,
//  ++ __half is a typedef of __fp16
//  ++ the "__h*" routines take "__half" arguments
// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
// that conversion in each call to a "__h*" routine... that is why we have the "operator __half" routine
||||
struct __half_raw {
|
||||
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
||||
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||
union {
|
||||
unsigned short x;
|
||||
__half data;
|
||||
};
|
||||
operator __half(void) const { return data; }
|
||||
};
|
||||
#endif
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
|
||||
typedef __half __half_raw;
|
||||
typedef __half __half_raw;
|
||||
#endif
|
||||
#endif
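A sketch of why the implicit operator __half above helps with the old HIP fp16 API (hypothetical device helper; __hadd is the HIP intrinsic taking __half arguments):
#if defined(EIGEN_HAS_OLD_HIP_FP16)
__device__ inline __half add_raw(const __half_raw& a, const __half_raw& b) {
  // no explicit cast needed: __half_raw converts to __half implicitly
  return __hadd(a, b);
}
#endif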
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
|
||||
@ -69,8 +94,19 @@ struct half_base : public __half_raw {
|
||||
EIGEN_DEVICE_FUNC half_base() {}
|
||||
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
|
||||
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
|
||||
#endif
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
@ -78,17 +114,38 @@ struct half_base : public __half_raw {
|
||||
|
||||
// Class definition.
|
||||
struct half : public half_impl::half_base {
|
||||
#if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
|
||||
// Writing this out as separate #if-else blocks to make the code easier to follow
|
||||
// The same applies to most #if-else blocks in this file
|
||||
#if !defined(EIGEN_HAS_GPU_FP16)
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#elif defined(EIGEN_HAS_HIP_FP16)
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
// Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
|
||||
// So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC half() {}
|
||||
|
||||
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
|
||||
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
explicit EIGEN_DEVICE_FUNC half(bool b)
|
||||
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
|
||||
@ -201,7 +258,8 @@ namespace Eigen {
|
||||
|
||||
namespace half_impl {
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
|
||||
|
||||
// Intrinsics for native fp16 support. Note that on current hardware,
|
||||
// these are no faster than fp32 arithmetic (you need to use the half2
|
||||
@ -262,7 +320,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
|
||||
|
||||
#else // Emulate support for half floats
|
||||
|
||||
// Definitions for CPUs and older CUDA, mostly working through conversion
|
||||
// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
|
||||
// to/from fp32.
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
|
||||
@ -342,7 +400,8 @@ union FP32 {
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
__half tmp_ff = __float2half(ff);
|
||||
return *(__half_raw*)&tmp_ff;
|
||||
|
||||
@ -398,7 +457,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __half2float(h);
|
||||
|
||||
#elif defined(EIGEN_HAS_FP16_C)
|
||||
@ -432,7 +492,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
|
||||
return (a.x & 0x7fff) == 0x7c00;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __hisnan(a);
|
||||
#else
|
||||
return (a.x & 0x7fff) > 0x7c00;
|
||||
@ -448,7 +509,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
|
||||
return result;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hexp(a));
|
||||
#else
|
||||
return half(::expf(float(a)));
|
||||
@ -458,7 +520,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
|
||||
return half(numext::expm1(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return half(::hlog(a));
|
||||
#else
|
||||
return half(::logf(float(a)));
|
||||
@ -471,7 +534,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
|
||||
return half(::log10f(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hsqrt(a));
|
||||
#else
|
||||
return half(::sqrtf(float(a)));
|
||||
@ -493,14 +557,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
|
||||
return half(::tanhf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hfloor(a));
|
||||
#else
|
||||
return half(::floorf(float(a)));
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hceil(a));
|
||||
#else
|
||||
return half(::ceilf(float(a)));
|
||||
@ -508,7 +574,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __hlt(b, a) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
@ -517,7 +584,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __hlt(a, b) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
@ -595,7 +663,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
|
||||
return Eigen::half(::expf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return Eigen::half(::hlog(a));
|
||||
#else
|
||||
return Eigen::half(::logf(float(a)));
|
||||
@ -629,9 +698,12 @@ struct hash<Eigen::half> {
|
||||
|
||||
|
||||
// Add the missing shfl_xor intrinsic
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||
#if EIGEN_CUDACC_VER < 90000
|
||||
#if (EIGEN_CUDACC_VER < 90000) || \
|
||||
defined(EIGEN_HAS_HIP_FP16)
|
||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||
#else
|
||||
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
|
||||
@ -640,7 +712,8 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
|
||||
#endif
|
||||
|
||||
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||
return Eigen::half_impl::raw_uint16_to_half(
|
||||
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
||||
@ -648,7 +721,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
namespace Eigen {
|
||||
namespace numext {
|
||||
|
||||
@ -674,4 +747,4 @@ bool (isfinite)(const Eigen::half& h) {
|
||||
} // namespace numext
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_HALF_CUDA_H
|
||||
#endif // EIGEN_HALF_GPU_H
|
@ -7,8 +7,8 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
#define EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
|
||||
#define EIGEN_MATH_FUNCTIONS_GPU_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 plog<float4>(const float4& a)
|
||||
{
|
||||
@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a)
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
#endif // EIGEN_MATH_FUNCTIONS_GPU_H
|
@ -7,8 +7,8 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_CUDA_H
|
||||
#define EIGEN_PACKET_MATH_CUDA_H
|
||||
#ifndef EIGEN_PACKET_MATH_GPU_H
|
||||
#define EIGEN_PACKET_MATH_GPU_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
||||
|
||||
@ -338,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) {
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_CUDA_H
|
||||
#endif // EIGEN_PACKET_MATH_GPU_H
|
@ -7,15 +7,16 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
#define EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
#ifndef EIGEN_PACKET_MATH_HALF_GPU_H
|
||||
#define EIGEN_PACKET_MATH_HALF_GPU_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Most of the following operations require arch >= 3.0
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
|
||||
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
||||
|
||||
@ -43,7 +44,18 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
|
||||
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
return half2half2(from);
|
||||
#else
|
||||
return __half2half2(from);
|
||||
#endif
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
return __half2half2(from);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
|
||||
@ -69,20 +81,46 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half*
|
||||
|
||||
template<>
|
||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
return __halves2half2((*(from+0)), (*(from+1)));
|
||||
#else
|
||||
return __ldg((const half2*)from);
|
||||
#endif
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 350
|
||||
return __ldg((const half2*)from);
|
||||
#else
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
return __halves2half2((*(from+0)), (*(from+1)));
|
||||
#else
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
#endif
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 350
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
#else
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
|
||||
@ -117,15 +155,29 @@ ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||
#else
|
||||
float f = __half2float(a) + 1.0f;
|
||||
return __halves2half2(a, __float2half(f));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hadd2(a, b);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hadd2(a, b);
|
||||
#else
|
||||
@ -137,9 +189,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
|
||||
float r2 = a2 + b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hsub2(a, b);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hsub2(a, b);
|
||||
#else
|
||||
@ -151,9 +211,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
|
||||
float r2 = a2 - b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hneg2(a);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hneg2(a);
|
||||
#else
|
||||
@ -161,11 +229,19 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
float a2 = __high2float(a);
|
||||
return __floats2half2_rn(-a1, -a2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hmul2(a, b);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hmul2(a, b);
|
||||
#else
|
||||
@ -177,9 +253,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
|
||||
float r2 = a2 * b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hfma2(a, b, c);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hfma2(a, b, c);
|
||||
#else
|
||||
@ -193,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
|
||||
float r2 = a2 * b2 + c2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
return h2div(a, b);
|
||||
#else
|
||||
return __h2div(a, b);
|
||||
#endif
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
@ -203,6 +299,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
|
||||
float r1 = a1 / b1;
|
||||
float r2 = a2 / b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
||||
@ -226,6 +324,12 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hadd(__low2half(a), __high2half(a));
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hadd(__low2half(a), __high2half(a));
|
||||
#else
|
||||
@ -233,9 +337,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
|
||||
float a2 = __high2float(a);
|
||||
return Eigen::half(__float2half(a1 + a2));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
return __hgt(first, second) ? first : second;
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
@ -245,9 +359,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
|
||||
float a2 = __high2float(a);
|
||||
return a1 > a2 ? __low2half(a) : __high2half(a);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
return __hlt(first, second) ? first : second;
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
@ -257,9 +381,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
|
||||
float a2 = __high2float(a);
|
||||
return a1 < a2 ? __low2half(a) : __high2half(a);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hmul(__low2half(a), __high2half(a));
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hmul(__low2half(a), __high2half(a));
|
||||
#else
|
||||
@ -267,6 +399,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
|
||||
float a2 = __high2float(a);
|
||||
return Eigen::half(__float2half(a1 * a2));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
||||
@ -285,7 +419,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
half2 plog<half2>(const half2& a) {
|
||||
@ -1281,4 +1416,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
|
||||
}
|
||||
}
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
#endif // EIGEN_PACKET_MATH_HALF_GPU_H
|
@ -7,8 +7,8 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TYPE_CASTING_CUDA_H
|
||||
#define EIGEN_TYPE_CASTING_CUDA_H
|
||||
#ifndef EIGEN_TYPE_CASTING_GPU_H
|
||||
#define EIGEN_TYPE_CASTING_GPU_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef Eigen::half result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __float2half(a);
|
||||
#else
|
||||
return Eigen::half(a);
|
||||
@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef Eigen::half result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __float2half(static_cast<float>(a));
|
||||
#else
|
||||
return Eigen::half(static_cast<float>(a));
|
||||
@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef float result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __half2float(a);
|
||||
#else
|
||||
return static_cast<float>(a);
|
||||
@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
|
||||
|
||||
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<Eigen::half, float> {
|
||||
@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_TYPE_CASTING_CUDA_H
|
||||
#endif // EIGEN_TYPE_CASTING_GPU_H
|
Eigen/src/Core/arch/HIP/hcc/math_constants.h | 23 (new file)
@ -0,0 +1,23 @@
|
||||
/*
|
||||
* math_constants.h -
|
||||
* HIP equivalent of the CUDA header of the same name
|
||||
*/
|
||||
|
||||
#ifndef __MATH_CONSTANTS_H__
|
||||
#define __MATH_CONSTANTS_H__
|
||||
|
||||
/* single precision constants */
|
||||
|
||||
#define HIPRT_INF_F __int_as_float(0x7f800000)
|
||||
#define HIPRT_NAN_F __int_as_float(0x7fffffff)
|
||||
#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
|
||||
#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
|
||||
#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
|
||||
#define HIPRT_ZERO_F 0.0f
|
||||
#define HIPRT_ONE_F 1.0f
|
||||
|
||||
/* double precision constants */
|
||||
#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
|
||||
#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
|
||||
|
||||
#endif
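Usage sketch (hypothetical device function): the HIPRT_* constants mirror the CUDART_* names from CUDA's math_constants.h, so device code ported between the two backends only needs the spelling swapped.
__device__ inline float checked_div(float num, float den) {
  // return +infinity on division by zero instead of relying on hardware behaviour
  return (den == 0.0f) ? HIPRT_INF_F : num / den;
}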
|
@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
|
||||
{
|
||||
#ifdef EIGEN_CUDACC
|
||||
#ifdef EIGEN_GPUCC
|
||||
// FIXME is there some kind of cuda::swap?
|
||||
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
|
||||
#else
|
||||
|
@ -436,7 +436,7 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
|
||||
typedef typename BinaryOp::second_argument_type second_argument_type;
|
||||
typedef typename BinaryOp::result_type result_type;
|
||||
|
||||
bind1st_op(const first_argument_type &val) : m_value(val) {}
|
||||
EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
|
||||
|
||||
@ -455,7 +455,7 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
|
||||
typedef typename BinaryOp::second_argument_type second_argument_type;
|
||||
typedef typename BinaryOp::result_type result_type;
|
||||
|
||||
bind2nd_op(const second_argument_type &val) : m_value(val) {}
|
||||
EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
|
||||
|
||||
|
@ -48,7 +48,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
||||
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
||||
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
||||
|
||||
EIGEN_DONT_INLINE static void run(
|
||||
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
|
||||
Index rows, Index cols,
|
||||
const LhsMapper& lhs,
|
||||
const RhsMapper& rhs,
|
||||
@ -57,7 +57,7 @@ EIGEN_DONT_INLINE static void run(
|
||||
};
|
||||
|
||||
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
||||
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
||||
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
||||
Index rows, Index cols,
|
||||
const LhsMapper& alhs,
|
||||
const RhsMapper& rhs,
|
||||
@ -231,7 +231,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
||||
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
||||
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
||||
|
||||
EIGEN_DONT_INLINE static void run(
|
||||
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
|
||||
Index rows, Index cols,
|
||||
const LhsMapper& lhs,
|
||||
const RhsMapper& rhs,
|
||||
@ -240,7 +240,7 @@ EIGEN_DONT_INLINE static void run(
|
||||
};
|
||||
|
||||
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
||||
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
||||
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
||||
Index rows, Index cols,
|
||||
const LhsMapper& alhs,
|
||||
const RhsMapper& rhs,
|
||||
|
@ -1008,9 +1008,12 @@ namespace Eigen {
|
||||
# define EIGEN_TRY try
|
||||
# define EIGEN_CATCH(X) catch (X)
|
||||
#else
|
||||
# ifdef EIGEN_CUDA_ARCH
|
||||
# if defined(EIGEN_CUDA_ARCH)
|
||||
# define EIGEN_THROW_X(X) asm("trap;")
|
||||
# define EIGEN_THROW asm("trap;")
|
||||
# elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
# define EIGEN_THROW_X(X) asm("s_trap 0")
|
||||
# define EIGEN_THROW asm("s_trap 0")
|
||||
# else
|
||||
# define EIGEN_THROW_X(X) std::abort()
|
||||
# define EIGEN_THROW std::abort()
|
||||
|
@ -70,7 +70,20 @@ inline void throw_std_bad_alloc()
|
||||
throw std::bad_alloc();
|
||||
#else
|
||||
std::size_t huge = static_cast<std::size_t>(-1);
|
||||
#if defined(EIGEN_HIPCC)
|
||||
//
|
||||
// calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
|
||||
// and as a consequence the code in the #else block triggers the hipcc warning :
|
||||
// "no overloaded function has restriction specifiers that are compatible with the ambient context"
|
||||
//
|
||||
// "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
|
||||
// the same on "operator new"
|
||||
// Reverting code back to the old version in this #if block for the hipcc compiler
|
||||
//
|
||||
new int[huge];
|
||||
#else
|
||||
::operator new(huge);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
|
||||
|
||||
void *result;
|
||||
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
result = ::malloc(size);
|
||||
#else
|
||||
result = std::malloc(size);
|
||||
#endif
|
||||
|
||||
#if EIGEN_DEFAULT_ALIGN_BYTES==16
|
||||
eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator.");
|
||||
#endif
|
||||
@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
|
||||
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
|
||||
{
|
||||
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
::free(ptr);
|
||||
#else
|
||||
std::free(ptr);
|
||||
#endif
|
||||
|
||||
#else
|
||||
handmade_aligned_free(ptr);
|
||||
#endif
|
||||
@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
|
||||
{
|
||||
check_that_malloc_is_allowed();
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
void *result = ::malloc(size);
|
||||
#else
|
||||
void *result = std::malloc(size);
|
||||
#endif
|
||||
|
||||
if(!result && size)
|
||||
throw_std_bad_alloc();
|
||||
return result;
|
||||
@ -232,7 +262,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
|
||||
{
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
::free(ptr);
|
||||
#else
|
||||
std::free(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
|
||||
@ -493,7 +527,11 @@ template<typename T> struct smart_copy_helper<T,true> {
|
||||
IntPtr size = IntPtr(end)-IntPtr(start);
|
||||
if(size==0) return;
|
||||
eigen_internal_assert(start!=0 && end!=0 && target!=0);
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
::memcpy(target, start, size);
|
||||
#else
|
||||
std::memcpy(target, start, size);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -11,9 +11,18 @@
|
||||
#ifndef EIGEN_META_H
|
||||
#define EIGEN_META_H
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#include <cfloat>
|
||||
#include <math_constants.h>
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#include <math_constants.h>
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
|
||||
@ -177,7 +186,7 @@ template<bool Condition, typename T=void> struct enable_if;
|
||||
template<typename T> struct enable_if<true,T>
|
||||
{ typedef T type; };
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
#if !defined(__FLT_EPSILON__)
|
||||
#define __FLT_EPSILON__ FLT_EPSILON
|
||||
#define __DBL_EPSILON__ DBL_EPSILON
|
||||
@ -199,13 +208,31 @@ template<> struct numeric_limits<float>
|
||||
EIGEN_DEVICE_FUNC
|
||||
static float epsilon() { return __FLT_EPSILON__; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
static float (max)() { return CUDART_MAX_NORMAL_F; }
|
||||
static float (max)() {
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
return CUDART_MAX_NORMAL_F;
|
||||
#else
|
||||
return HIPRT_MAX_NORMAL_F;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
static float (min)() { return FLT_MIN; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
static float infinity() { return CUDART_INF_F; }
|
||||
static float infinity() {
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
return CUDART_INF_F;
|
||||
#else
|
||||
return HIPRT_INF_F;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
static float quiet_NaN() { return CUDART_NAN_F; }
|
||||
static float quiet_NaN() {
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
return CUDART_NAN_F;
|
||||
#else
|
||||
return HIPRT_NAN_F;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
template<> struct numeric_limits<double>
|
||||
{
|
||||
@ -216,9 +243,21 @@ template<> struct numeric_limits<double>
|
||||
EIGEN_DEVICE_FUNC
|
||||
static double (min)() { return DBL_MIN; }
|
||||
EIGEN_DEVICE_FUNC
|
||||
static double infinity() { return CUDART_INF; }
|
||||
static double infinity() {
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
return CUDART_INF;
|
||||
#else
|
||||
return HIPRT_INF;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC
|
||||
static double quiet_NaN() { return CUDART_NAN; }
|
||||
static double quiet_NaN() {
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
return CUDART_NAN;
|
||||
#else
|
||||
return HIPRT_NAN;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
template<> struct numeric_limits<int>
|
||||
{
|
||||
@ -531,13 +570,13 @@ template<typename T, typename U> struct scalar_product_traits
|
||||
|
||||
namespace numext {
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
|
||||
#else
|
||||
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
using internal::device::numeric_limits;
|
||||
#else
|
||||
using std::numeric_limits;
|
||||
@ -557,7 +596,7 @@ T div_ceil(const T &a, const T &b)
|
||||
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
bool equal_strict(const X& x,const Y& y) { return x == y; }
|
||||
|
||||
#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
|
||||
|
||||
@ -568,7 +607,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to<double
|
||||
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
bool not_equal_strict(const X& x,const Y& y) { return x != y; }
|
||||
|
||||
#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)
|
||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||
bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
|
||||
|
||||
|
@ -1299,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
|
||||
#endif
|
||||
}//end deflation
|
||||
|
||||
#ifndef EIGEN_CUDACC
|
||||
#if !defined(EIGEN_GPUCC)
|
||||
/** \svd_module
|
||||
*
|
||||
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
|
||||
|
@ -19,7 +19,10 @@ macro(ei_add_test_internal testname testname_with_suffix)
|
||||
endif()
|
||||
|
||||
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
|
||||
if(EIGEN_TEST_CUDA_CLANG)
|
||||
if(EIGEN_TEST_HIP)
|
||||
hip_reset_flags()
|
||||
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}")
|
||||
elseif(EIGEN_TEST_CUDA_CLANG)
|
||||
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
|
||||
if(CUDA_64_BIT_DEVICE_CODE)
|
||||
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
|
||||
@ -491,6 +494,11 @@ macro(ei_testing_print_summary)
|
||||
else()
|
||||
message(STATUS "CUDA: OFF")
|
||||
endif()
|
||||
if(EIGEN_TEST_HIP)
|
||||
message(STATUS "HIP: ON (using hipcc)")
|
||||
else()
|
||||
message(STATUS "HIP: OFF")
|
||||
endif()
|
||||
|
||||
endif() # vectorization / alignment options
|
||||
|
||||
|
@ -399,7 +399,7 @@ if(CUDA_FOUND)
|
||||
cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||
|
||||
ei_add_test(cuda_basic)
|
||||
ei_add_test(gpu_basic)
|
||||
|
||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||
|
||||
@ -408,6 +408,48 @@ endif(CUDA_FOUND)
|
||||
endif(EIGEN_TEST_CUDA)
|
||||
|
||||
|
||||
# HIP unit tests
|
||||
option(EIGEN_TEST_HIP "Add HIP support." OFF)
|
||||
if (EIGEN_TEST_HIP)
|
||||
|
||||
set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
|
||||
|
||||
if (EXISTS ${HIP_PATH})
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
|
||||
|
||||
find_package(HIP REQUIRED)
|
||||
if (HIP_FOUND)
|
||||
|
||||
execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
|
||||
|
||||
if (${HIP_PLATFORM} STREQUAL "hcc")
|
||||
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
include_directories(${HIP_PATH}/include)
|
||||
|
||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||
ei_add_test(gpu_basic)
|
||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||
|
||||
elseif (${HIP_PLATFORM} STREQUAL "nvcc")
|
||||
message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
|
||||
else ()
|
||||
message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
|
||||
endif()
|
||||
|
||||
endif(HIP_FOUND)
|
||||
|
||||
else ()
|
||||
|
||||
message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
|
||||
|
||||
endif()
|
||||
|
||||
endif(EIGEN_TEST_HIP)
|
||||
|
||||
|
||||
|
||||
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
|
||||
add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
|
||||
|
||||
|
@ -15,13 +15,11 @@
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cuda_basic
|
||||
#define EIGEN_TEST_FUNC gpu_basic
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
|
||||
#include <math_constants.h>
|
||||
#include <cuda.h>
|
||||
#include "main.h"
|
||||
#include "cuda_common.h"
|
||||
#include "gpu_common.h"
|
||||
|
||||
// Check that dense modules can be properly parsed by nvcc
|
||||
#include <Eigen/Dense>
|
||||
@ -164,40 +162,51 @@ struct matrix_inverse {
|
||||
}
|
||||
};
|
||||
|
||||
void test_cuda_basic()
|
||||
void test_gpu_basic()
|
||||
{
|
||||
ei_test_init_cuda();
|
||||
ei_test_init_gpu();
|
||||
|
||||
int nthreads = 100;
|
||||
Eigen::VectorXf in, out;
|
||||
|
||||
#ifndef __CUDA_ARCH__
|
||||
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
|
||||
int data_size = nthreads * 512;
|
||||
in.setRandom(data_size);
|
||||
out.setRandom(data_size);
|
||||
#endif
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Vector3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise<Array44f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array4f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(replicate<Array33f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(redux<Array4f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(redux<Matrix3f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix2f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(matrix_inverse<Matrix4f>(), nthreads, in, out) );
|
||||
#if !defined(EIGEN_USE_HIP)
|
||||
// FIXME
|
||||
// These subtests result in a compile failure on the HIP platform
|
||||
//
|
||||
// eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
|
||||
// base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
|
||||
// (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );
|
||||
#endif
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues<Matrix4f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
|
||||
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );
|
||||
|
||||
#if !defined(EIGEN_USE_HIP)
|
||||
// FIXME
|
||||
// These subtests result in a linking error on the HIP platform
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
|
||||
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
|
||||
#endif
|
||||
}
|
@ -1,13 +1,22 @@
|
||||
|
||||
#ifndef EIGEN_TEST_CUDA_COMMON_H
|
||||
#define EIGEN_TEST_CUDA_COMMON_H
|
||||
#ifndef EIGEN_TEST_GPU_COMMON_H
|
||||
#define EIGEN_TEST_GPU_COMMON_H
|
||||
|
||||
#ifdef EIGEN_USE_HIP
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#else
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#endif
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <iostream>
|
||||
|
||||
#ifndef __CUDACC__
|
||||
#define EIGEN_USE_GPU
|
||||
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
|
||||
|
||||
#if !defined(__CUDACC__) && !defined(__HIPCC__)
|
||||
dim3 threadIdx, blockDim, blockIdx;
|
||||
#endif
|
||||
|
||||
@ -21,7 +30,7 @@ void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
|
||||
|
||||
template<typename Kernel, typename Input, typename Output>
|
||||
__global__
|
||||
void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
|
||||
void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
|
||||
{
|
||||
int i = threadIdx.x + blockIdx.x*blockDim.x;
|
||||
if(i<n) {
|
||||
@ -31,61 +40,70 @@ void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* o
|
||||
|
||||
|
||||
template<typename Kernel, typename Input, typename Output>
|
||||
void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out)
|
||||
void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
|
||||
{
|
||||
typename Input::Scalar* d_in;
|
||||
typename Output::Scalar* d_out;
|
||||
std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar);
|
||||
std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
|
||||
|
||||
cudaMalloc((void**)(&d_in), in_bytes);
|
||||
cudaMalloc((void**)(&d_out), out_bytes);
|
||||
gpuMalloc((void**)(&d_in), in_bytes);
|
||||
gpuMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
|
||||
gpuMemcpy(d_out, out.data(), out_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
// Simple and non-optimal 1D mapping assuming n is not too large
|
||||
// That's only for unit testing!
|
||||
dim3 Blocks(128);
|
||||
dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
|
||||
|
||||
cudaThreadSynchronize();
|
||||
run_on_cuda_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
|
||||
cudaThreadSynchronize();
|
||||
gpuDeviceSynchronize();
|
||||
|
||||
#ifdef EIGEN_USE_HIP
|
||||
hipLaunchKernelGGL(run_on_gpu_meta_kernel<Kernel,
|
||||
typename std::decay<decltype(*d_in)>::type,
|
||||
typename std::decay<decltype(*d_out)>::type>,
|
||||
dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
|
||||
#else
|
||||
run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
|
||||
#endif
|
||||
|
||||
gpuDeviceSynchronize();
|
||||
|
||||
// check inputs have not been modified
|
||||
cudaMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost);
|
||||
gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
|
||||
gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost);
|
||||
|
||||
cudaFree(d_in);
|
||||
cudaFree(d_out);
|
||||
gpuFree(d_in);
|
||||
gpuFree(d_out);
|
||||
}
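The launch call is the one place where the CUDA and HIP paths cannot share a single spelling: hipcc does not accept the triple-chevron <<< >>> syntax, so the HIP branch goes through hipLaunchKernelGGL. A minimal sketch of how such a divergence can be folded into one variadic helper (launch_gpu_kernel is a hypothetical name, not part of this patch, which keeps both branches inline):

// Hypothetical helper; assumes the same EIGEN_USE_HIP switch used above.
// Dynamic shared memory and stream are fixed to 0 to mirror the test harness.
template <typename Kernel, typename... Args>
void launch_gpu_kernel(Kernel kernel, dim3 grid, dim3 block, Args... args) {
#if defined(EIGEN_USE_HIP)
  hipLaunchKernelGGL(kernel, grid, block, 0 /*sharedMemBytes*/, 0 /*stream*/, args...);
#else
  kernel<<<grid, block>>>(args...);
#endif
}
// Usage, equivalent to the guarded block above:
//   launch_gpu_kernel(run_on_gpu_meta_kernel<Kernel, InScalar, OutScalar>, Grids, Blocks, ker, n, d_in, d_out);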
|
||||
|
||||
|
||||
template<typename Kernel, typename Input, typename Output>
|
||||
void run_and_compare_to_cuda(const Kernel& ker, int n, const Input& in, Output& out)
|
||||
void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& out)
|
||||
{
|
||||
Input in_ref, in_cuda;
|
||||
Output out_ref, out_cuda;
|
||||
#ifndef __CUDA_ARCH__
|
||||
in_ref = in_cuda = in;
|
||||
out_ref = out_cuda = out;
|
||||
Input in_ref, in_gpu;
|
||||
Output out_ref, out_gpu;
|
||||
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
|
||||
in_ref = in_gpu = in;
|
||||
out_ref = out_gpu = out;
|
||||
#endif
|
||||
run_on_cpu (ker, n, in_ref, out_ref);
|
||||
run_on_cuda(ker, n, in_cuda, out_cuda);
|
||||
#ifndef __CUDA_ARCH__
|
||||
VERIFY_IS_APPROX(in_ref, in_cuda);
|
||||
VERIFY_IS_APPROX(out_ref, out_cuda);
|
||||
run_on_gpu(ker, n, in_gpu, out_gpu);
|
||||
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
|
||||
VERIFY_IS_APPROX(in_ref, in_gpu);
|
||||
VERIFY_IS_APPROX(out_ref, out_gpu);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void ei_test_init_cuda()
|
||||
void ei_test_init_gpu()
|
||||
{
|
||||
int device = 0;
|
||||
cudaDeviceProp deviceProp;
|
||||
cudaGetDeviceProperties(&deviceProp, device);
|
||||
std::cout << "CUDA device info:\n";
|
||||
gpuDeviceProp_t deviceProp;
|
||||
gpuGetDeviceProperties(&deviceProp, device);
|
||||
std::cout << "GPU device info:\n";
|
||||
std::cout << " name: " << deviceProp.name << "\n";
|
||||
std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n";
|
||||
std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n";
|
||||
@ -98,4 +116,4 @@ void ei_test_init_cuda()
|
||||
std::cout << " computeMode: " << deviceProp.computeMode << "\n";
|
||||
}
|
||||
|
||||
#endif // EIGEN_TEST_CUDA_COMMON_H
|
||||
#endif // EIGEN_TEST_GPU_COMMON_H
|
@ -9,7 +9,7 @@
|
||||
|
||||
#include "main.h"
|
||||
|
||||
#include <Eigen/src/Core/arch/CUDA/Half.h>
|
||||
#include <Eigen/src/Core/arch/GPU/Half.h>
|
||||
|
||||
// Make sure it's possible to forward declare Eigen::half
|
||||
namespace Eigen {
|
||||
|
32
test/main.h
32
test/main.h
@ -67,11 +67,27 @@
|
||||
// protected by parenthesis against macro expansion, the min()/max() macros
|
||||
// are defined here and any not-parenthesized min/max call will cause a
|
||||
// compiler error.
|
||||
#define min(A,B) please_protect_your_min_with_parentheses
|
||||
#define max(A,B) please_protect_your_max_with_parentheses
|
||||
#define isnan(X) please_protect_your_isnan_with_parentheses
|
||||
#define isinf(X) please_protect_your_isinf_with_parentheses
|
||||
#define isfinite(X) please_protect_your_isfinite_with_parentheses
|
||||
#if !defined(__HIPCC__)
//
// HIP header files include the following files
//  <thread>
//  <regex>
//  <unordered_map>
// which seem to contain unparenthesized calls to "max"/"min", triggering the check below and causing the compile to fail.
//
// Including those header files before the macro definitions for "min"/"max" only partially resolves the issue,
// because other HIP header files also define "isnan"/"isinf"/"isfinite" functions, which are needed in other headers.
//
// So this check is simply disabled for HIP.
//
#define min(A,B) please_protect_your_min_with_parentheses
#define max(A,B) please_protect_your_max_with_parentheses
#define isnan(X) please_protect_your_isnan_with_parentheses
#define isinf(X) please_protect_your_isinf_with_parentheses
#define isfinite(X) please_protect_your_isfinite_with_parentheses
|
||||
#endif
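To make the intent of these trap macros concrete, the snippet below (hypothetical, not part of the test suite) shows what they catch on the non-HIP path: an unparenthesized std::min call expands the function-style macro and fails to compile, while the parenthesized form is immune to macro expansion.

#include <algorithm>
// OK even with the trap macro in effect: the parentheses prevent function-style macro expansion.
int safe_min(int a, int b) { return (std::min)(a, b); }
// Would not compile once "min" is defined as above (kept commented out for that reason):
// int unsafe_min(int a, int b) { return std::min(a, b); }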
|
||||
|
||||
#ifdef M_PI
|
||||
#undef M_PI
|
||||
#endif
|
||||
@ -154,7 +170,7 @@ namespace Eigen
|
||||
|
||||
#define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "")
|
||||
|
||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__)
|
||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
|
||||
#define EIGEN_EXCEPTIONS
|
||||
#endif
|
||||
|
||||
@ -233,7 +249,7 @@ namespace Eigen
|
||||
}
|
||||
#endif //EIGEN_EXCEPTIONS
|
||||
|
||||
#elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS
|
||||
#elif !defined(__CUDACC__) && !defined(__HIPCC__)// EIGEN_DEBUG_ASSERTS
|
||||
// see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
|
||||
#define eigen_assert(a) \
|
||||
if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
|
||||
@ -290,7 +306,7 @@ namespace Eigen
|
||||
std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
|
||||
#endif
|
||||
|
||||
#if !defined(__CUDACC__)
|
||||
#if !defined(__CUDACC__) && !defined(__HIPCC__)
|
||||
#define EIGEN_USE_CUSTOM_ASSERT
|
||||
#endif
|
||||
|
||||
|
@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t;
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_USE_GPU
|
||||
#include <iostream>
|
||||
#include <cuda_runtime.h>
|
||||
#if __cplusplus >= 201103L
|
||||
#include <atomic>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <iostream>
|
||||
#if defined(EIGEN_USE_HIP)
|
||||
#include <hip/hip_runtime.h>
|
||||
#else
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
#if __cplusplus >= 201103L
|
||||
#include <atomic>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "src/Tensor/TensorMacros.h"
|
||||
@ -95,7 +99,7 @@ typedef unsigned __int64 uint64_t;
|
||||
#include "src/Tensor/TensorCostModel.h"
|
||||
#include "src/Tensor/TensorDeviceDefault.h"
|
||||
#include "src/Tensor/TensorDeviceThreadPool.h"
|
||||
#include "src/Tensor/TensorDeviceCuda.h"
|
||||
#include "src/Tensor/TensorDeviceGpu.h"
|
||||
#include "src/Tensor/TensorDeviceSycl.h"
|
||||
#include "src/Tensor/TensorIndexList.h"
|
||||
#include "src/Tensor/TensorDimensionList.h"
|
||||
@ -112,14 +116,14 @@ typedef unsigned __int64 uint64_t;
|
||||
#include "src/Tensor/TensorEvaluator.h"
|
||||
#include "src/Tensor/TensorExpr.h"
|
||||
#include "src/Tensor/TensorReduction.h"
|
||||
#include "src/Tensor/TensorReductionCuda.h"
|
||||
#include "src/Tensor/TensorReductionGpu.h"
|
||||
#include "src/Tensor/TensorArgMax.h"
|
||||
#include "src/Tensor/TensorConcatenation.h"
|
||||
#include "src/Tensor/TensorContractionMapper.h"
|
||||
#include "src/Tensor/TensorContractionBlocking.h"
|
||||
#include "src/Tensor/TensorContraction.h"
|
||||
#include "src/Tensor/TensorContractionThreadPool.h"
|
||||
#include "src/Tensor/TensorContractionCuda.h"
|
||||
#include "src/Tensor/TensorContractionGpu.h"
|
||||
#include "src/Tensor/TensorConversion.h"
|
||||
#include "src/Tensor/TensorConvolution.h"
|
||||
#include "src/Tensor/TensorFFT.h"
|
||||
|
@ -448,7 +448,10 @@ struct TensorContractionEvaluatorBase
|
||||
}
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
|
||||
#if !defined(EIGEN_HIPCC)
|
||||
EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
void evalGemv(Scalar* buffer) const {
|
||||
const Index rows = m_i_size;
|
||||
const Index cols = m_k_size;
|
||||
|
||||
@ -489,7 +492,10 @@ struct TensorContractionEvaluatorBase
|
||||
}
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
|
||||
#if !defined(EIGEN_HIPCC)
|
||||
EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
void evalGemm(Scalar* buffer) const {
|
||||
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
|
||||
if (m_can_use_xsmm) {
|
||||
evalGemmXSMM(buffer);
|
||||
|
@ -28,7 +28,24 @@ class TensorContractionBlocking {
|
||||
typedef typename LhsMapper::Scalar LhsScalar;
|
||||
typedef typename RhsMapper::Scalar RhsScalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
|
||||
/*
Adding EIGEN_DEVICE_FUNC unconditionally to the 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`,
which in turn requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`,
which in turn requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
(else HIPCC will error out).

However, adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
results in NVCC erroring out with the following error:

../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
|
||||
*/
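For readers unfamiliar with error #2901: NVCC rejects function-scope statics that require dynamic (run-time) initialization when the enclosing function is compiled for the device, while statics with constant initializers are fine. A tiny illustration of the difference (hypothetical code, not from Eigen):

// Hypothetical example; query_l2_cache_size() stands in for any run-time computation.
__host__ __device__ inline int cached_block_size() {
  static const int kFixed = 128;                // OK on the device: constant initialization
  // static int tuned = query_l2_cache_size();  // would trigger NVCC error #2901 in device code
  return kFixed;
}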
|
||||
|
||||
#if !defined(EIGEN_HIPCC)
|
||||
EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
|
||||
kc_(k), mc_(m), nc_(n)
|
||||
{
|
||||
if (ShardingType == ShardByCol) {
|
||||
|
@ -9,10 +9,10 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
// the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation.
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
#else
|
||||
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
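The primitive chosen here is the classic XOR-butterfly warp reduction; a stand-alone sketch of that pattern (warp_sum is a hypothetical helper that reduces a single value, whereas the kernel above reduces a whole res(i, j) register block):

// Hypothetical helper; the mask sequence assumes the 32-lane layout of the CUDA kernel.
__device__ inline float warp_sum(float v) {
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
  for (int mask = 16; mask > 0; mask >>= 1) v += __shfl_xor(v, mask);
#else
  for (int mask = 16; mask > 0; mask >>= 1) v += __shfl_xor_sync(0xFFFFFFFF, v, mask);
#endif
  return v;  // every lane ends up holding the sum across the warp
}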
|
||||
@ -503,7 +503,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
||||
template<typename Scalar, typename Index, typename LhsMapper,
|
||||
typename RhsMapper, typename OutputMapper>
|
||||
__global__ void
|
||||
#if defined(EIGEN_HIPCC)
|
||||
__launch_bounds__(512, 1)
|
||||
#else
|
||||
__launch_bounds__(512)
|
||||
#endif
|
||||
EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
|
||||
const OutputMapper output,
|
||||
const Index m_size, const Index n_size, const Index k_size) {
|
||||
@ -542,7 +546,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
||||
results[i].x = results[i].y = results[i].z = results[i].w = 0;
|
||||
}
|
||||
|
||||
|
||||
#define prefetch_lhs(reg, row, col) \
|
||||
if (!CHECK_LHS_BOUNDARY) { \
|
||||
if (col < k_size) { \
|
||||
@ -563,12 +566,12 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
||||
reg.x =lhs(row + 0, col); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
|
||||
} \
|
||||
|
||||
Index lhs_vert = base_m+threadIdx.x*4;
|
||||
|
||||
for (Index k = 0; k < k_size; k += 16) {
|
||||
|
||||
lhs_pf0 = internal::pset1<float4>(0);
|
||||
rhs_pf0 = internal::pset1<float4>(0);
|
||||
|
||||
@ -618,7 +621,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
||||
x1 = rhs_pf0.x;
|
||||
x2 = rhs_pf0.z;
|
||||
}
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
|
||||
x1 = __shfl_xor(x1, 4);
|
||||
x2 = __shfl_xor(x2, 4);
|
||||
#else
|
||||
@ -695,7 +698,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
||||
|
||||
#undef prefetch_lhs
|
||||
#undef add_vals
|
||||
|
||||
|
||||
Index horiz_base = threadIdx.y*4+base_n;
|
||||
if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
@ -784,7 +787,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
||||
results[i].x = results[i].y = results[i].z = results[i].w = 0;
|
||||
}
|
||||
|
||||
|
||||
Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
|
||||
for (Index k = 0; k < k_size; k += 32) {
|
||||
lhs_pf0 = internal::pset1<float4>(0);
|
||||
@ -1069,7 +1071,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
||||
__syncthreads();
|
||||
} // end loop over k
|
||||
|
||||
|
||||
__syncthreads();
|
||||
Index horiz_base = (threadIdx.y/4)*8+base_n;
|
||||
if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
|
||||
@ -1134,7 +1135,11 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
||||
template<typename Index, typename LhsMapper,
|
||||
typename RhsMapper, typename OutputMapper>
|
||||
__global__ void
|
||||
#if defined(EIGEN_HIPCC)
|
||||
__launch_bounds__(256, 1)
|
||||
#else
|
||||
__launch_bounds__(256)
|
||||
#endif
|
||||
EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
|
||||
const OutputMapper output,
|
||||
const Index m_size, const Index n_size, const Index k_size) {
|
||||
@ -1177,7 +1182,11 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
|
||||
template<typename Index, typename LhsMapper,
|
||||
typename RhsMapper, typename OutputMapper>
|
||||
__global__ void
|
||||
#if defined(EIGEN_HIPCC)
|
||||
__launch_bounds__(256, 1)
|
||||
#else
|
||||
__launch_bounds__(256)
|
||||
#endif
|
||||
EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
|
||||
const OutputMapper output,
|
||||
const Index m_size, const Index n_size, const Index k_size) {
|
||||
@ -1323,7 +1332,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(8, 8, 8);
|
||||
LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
}
|
||||
};
|
||||
|
||||
@ -1334,13 +1343,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(16, 16, 1);
|
||||
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
} else {
|
||||
const Index m_blocks = (m + 127) / 128;
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(8, 32, 1);
|
||||
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -1384,12 +1393,17 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
|
||||
OutputMapper output(buffer, m);
|
||||
|
||||
setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
|
||||
#if defined(EIGEN_USE_HIP)
|
||||
setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
|
||||
#else
|
||||
setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
|
||||
#endif
|
||||
|
||||
LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_USE_GPU and EIGEN_CUDACC
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
|
||||
#endif // EIGEN_USE_GPU and EIGEN_GPUCC
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
|
@ -54,8 +54,8 @@ class IndexMapper {
|
||||
}
|
||||
}
|
||||
|
||||
array<Index, NumDims> cudaInputDimensions;
|
||||
array<Index, NumDims> cudaOutputDimensions;
|
||||
array<Index, NumDims> gpuInputDimensions;
|
||||
array<Index, NumDims> gpuOutputDimensions;
|
||||
array<Index, NumDims> tmp = dimensions;
|
||||
array<Index, NumDims> ordering;
|
||||
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
@ -65,8 +65,8 @@ class IndexMapper {
|
||||
const Index index = i + offset;
|
||||
ordering[index] = indices[i];
|
||||
tmp[indices[i]] = -1;
|
||||
cudaInputDimensions[index] = input_dims[indices[i]];
|
||||
cudaOutputDimensions[index] = dimensions[indices[i]];
|
||||
gpuInputDimensions[index] = input_dims[indices[i]];
|
||||
gpuOutputDimensions[index] = dimensions[indices[i]];
|
||||
}
|
||||
|
||||
int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
@ -75,8 +75,8 @@ class IndexMapper {
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
if (tmp[i] >= 0) {
|
||||
ordering[written] = i;
|
||||
cudaInputDimensions[written] = input_dims[i];
|
||||
cudaOutputDimensions[written] = dimensions[i];
|
||||
gpuInputDimensions[written] = input_dims[i];
|
||||
gpuOutputDimensions[written] = dimensions[i];
|
||||
++written;
|
||||
}
|
||||
}
|
||||
@ -89,37 +89,37 @@ class IndexMapper {
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
if (i > NumKernelDims) {
|
||||
m_cudaInputStrides[i] =
|
||||
m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
|
||||
m_cudaOutputStrides[i] =
|
||||
m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
|
||||
m_gpuInputStrides[i] =
|
||||
m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
|
||||
m_gpuOutputStrides[i] =
|
||||
m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
|
||||
} else {
|
||||
m_cudaInputStrides[i] = 1;
|
||||
m_cudaOutputStrides[i] = 1;
|
||||
m_gpuInputStrides[i] = 1;
|
||||
m_gpuOutputStrides[i] = 1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = NumDims - 1; i >= 0; --i) {
|
||||
if (static_cast<size_t>(i + 1) < offset) {
|
||||
m_cudaInputStrides[i] =
|
||||
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
|
||||
m_cudaOutputStrides[i] =
|
||||
m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
|
||||
m_gpuInputStrides[i] =
|
||||
m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
|
||||
m_gpuOutputStrides[i] =
|
||||
m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
|
||||
} else {
|
||||
m_cudaInputStrides[i] = 1;
|
||||
m_cudaOutputStrides[i] = 1;
|
||||
m_gpuInputStrides[i] = 1;
|
||||
m_gpuOutputStrides[i] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int d = NumDims - 1; d > NumKernelDims; --d) {
|
||||
const Index idx = p / m_cudaInputStrides[d];
|
||||
const Index idx = p / m_gpuInputStrides[d];
|
||||
inputIndex += idx * m_inputStrides[d];
|
||||
p -= idx * m_cudaInputStrides[d];
|
||||
p -= idx * m_gpuInputStrides[d];
|
||||
}
|
||||
inputIndex += p * m_inputStrides[NumKernelDims];
|
||||
} else {
|
||||
@ -128,22 +128,22 @@ class IndexMapper {
|
||||
limit = NumDims - NumKernelDims - 1;
|
||||
}
|
||||
for (int d = 0; d < limit; ++d) {
|
||||
const Index idx = p / m_cudaInputStrides[d];
|
||||
const Index idx = p / m_gpuInputStrides[d];
|
||||
inputIndex += idx * m_inputStrides[d];
|
||||
p -= idx * m_cudaInputStrides[d];
|
||||
p -= idx * m_gpuInputStrides[d];
|
||||
}
|
||||
inputIndex += p * m_inputStrides[limit];
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
|
||||
Index outputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
for (int d = NumDims - 1; d > NumKernelDims; --d) {
|
||||
const Index idx = p / m_cudaOutputStrides[d];
|
||||
const Index idx = p / m_gpuOutputStrides[d];
|
||||
outputIndex += idx * m_outputStrides[d];
|
||||
p -= idx * m_cudaOutputStrides[d];
|
||||
p -= idx * m_gpuOutputStrides[d];
|
||||
}
|
||||
outputIndex += p * m_outputStrides[NumKernelDims];
|
||||
} else {
|
||||
@ -152,44 +152,44 @@ class IndexMapper {
|
||||
limit = NumDims - NumKernelDims - 1;
|
||||
}
|
||||
for (int d = 0; d < limit; ++d) {
|
||||
const Index idx = p / m_cudaOutputStrides[d];
|
||||
const Index idx = p / m_gpuOutputStrides[d];
|
||||
outputIndex += idx * m_outputStrides[d];
|
||||
p -= idx * m_cudaOutputStrides[d];
|
||||
p -= idx * m_gpuOutputStrides[d];
|
||||
}
|
||||
outputIndex += p * m_outputStrides[limit];
|
||||
}
|
||||
return outputIndex;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
|
||||
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? 0
|
||||
: NumDims - NumKernelDims;
|
||||
return i * m_inputStrides[offset];
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
|
||||
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? 0
|
||||
: NumDims - NumKernelDims;
|
||||
return i * m_outputStrides[offset];
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
|
||||
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? 0
|
||||
: NumDims - NumKernelDims;
|
||||
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
|
||||
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? 0
|
||||
: NumDims - NumKernelDims;
|
||||
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
|
||||
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? 0
|
||||
: NumDims - NumKernelDims;
|
||||
@ -197,7 +197,7 @@ class IndexMapper {
|
||||
k * m_inputStrides[offset + 2];
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
|
||||
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
|
||||
? 0
|
||||
: NumDims - NumKernelDims;
|
||||
@ -209,8 +209,8 @@ class IndexMapper {
|
||||
static const int NumDims = internal::array_size<InputDims>::value;
|
||||
array<Index, NumDims> m_inputStrides;
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
array<Index, NumDims> m_cudaInputStrides;
|
||||
array<Index, NumDims> m_cudaOutputStrides;
|
||||
array<Index, NumDims> m_gpuInputStrides;
|
||||
array<Index, NumDims> m_gpuOutputStrides;
|
||||
};
|
||||
|
||||
|
||||
@ -553,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
|
||||
|
||||
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
|
||||
|
||||
template <int StaticKernelSize>
|
||||
struct GetKernelSize {
|
||||
@ -576,8 +580,12 @@ __global__ void EigenConvolutionKernel1D(
    indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
|
||||
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
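The HIP_DYNAMIC_SHARED guard above is the only change needed to keep dynamically sized shared memory working under hipcc. A self-contained sketch of the same idiom outside of Eigen (illustrative kernel; the names are not from this patch, and the launch must pass blockDim.x * sizeof(float) bytes of dynamic shared memory):

__global__ void scale_via_shared(const float* in, float* out, int n, float factor) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)        // hipcc's macro for dynamically sized shared memory
#else
  extern __shared__ float s[];        // CUDA spelling of the same declaration
#endif
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  s[threadIdx.x] = (i < n) ? in[i] * factor : 0.0f;  // stage the value through shared memory
  __syncthreads();                                   // reached by every thread in the block
  if (i < n) out[i] = s[threadIdx.x];
}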
|
||||
@ -588,18 +592,18 @@ __global__ void EigenConvolutionKernel1D(
|
||||
|
||||
for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
|
||||
// Load inputs to shared memory
|
||||
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
|
||||
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
|
||||
const int plane_kernel_offset = threadIdx.y * num_x_input;
|
||||
#pragma unroll
|
||||
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
|
||||
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
|
||||
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
|
||||
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Compute the convolution
|
||||
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
|
||||
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
|
||||
@ -609,7 +613,7 @@ __global__ void EigenConvolutionKernel1D(
|
||||
for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
|
||||
result += s[k + kernel_offset] * kernel[k];
|
||||
}
|
||||
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
|
||||
const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
|
||||
buffer[tensor_index] = result;
|
||||
}
|
||||
__syncthreads();
|
||||
@ -625,7 +629,11 @@ __global__ void EigenConvolutionKernel2D(
|
||||
const float* __restrict kernel, const int numPlanes, const int numX,
|
||||
const int maxX, const int numY, const int maxY, const int kernelSizeX,
|
||||
const int kernelSizeY, float* buffer) {
|
||||
#if defined(EIGEN_HIPCC)
|
||||
HIP_DYNAMIC_SHARED(float, s)
|
||||
#else
|
||||
extern __shared__ float s[];
|
||||
#endif
|
||||
|
||||
const int first_x = blockIdx.x * maxX;
|
||||
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
|
||||
@ -642,7 +650,7 @@ __global__ void EigenConvolutionKernel2D(
|
||||
|
||||
for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
|
||||
|
||||
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
|
||||
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
|
||||
const int plane_kernel_offset = threadIdx.z * num_y_input;
|
||||
|
||||
// Load inputs to shared memory
|
||||
@ -651,7 +659,7 @@ __global__ void EigenConvolutionKernel2D(
|
||||
const int input_offset = num_x_input * (j + plane_kernel_offset);
|
||||
#pragma unroll
|
||||
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
|
||||
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
|
||||
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
|
||||
s[i + input_offset] = eval.coeff(tensor_index);
|
||||
}
|
||||
}
|
||||
@ -659,7 +667,7 @@ __global__ void EigenConvolutionKernel2D(
|
||||
__syncthreads();
|
||||
|
||||
// Convolution
|
||||
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
|
||||
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
|
||||
@ -675,7 +683,7 @@ __global__ void EigenConvolutionKernel2D(
|
||||
result += s[k + input_offset] * kernel[k + kernel_offset];
|
||||
}
|
||||
}
|
||||
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
|
||||
const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
|
||||
buffer[tensor_index] = result;
|
||||
}
|
||||
}
|
||||
@ -693,7 +701,11 @@ __global__ void EigenConvolutionKernel3D(
|
||||
const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
|
||||
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
|
||||
const size_t kernelSizeZ, float* buffer) {
|
||||
#if defined(EIGEN_HIPCC)
|
||||
HIP_DYNAMIC_SHARED(float, s)
|
||||
#else
|
||||
extern __shared__ float s[];
|
||||
#endif
|
||||
|
||||
// Load inputs to shared memory
|
||||
const int first_x = blockIdx.x * maxX;
|
||||
@ -710,13 +722,13 @@ __global__ void EigenConvolutionKernel3D(
|
||||
|
||||
for (int p = 0; p < numPlanes; ++p) {
|
||||
|
||||
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
|
||||
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
|
||||
const int plane_kernel_offset = 0;
|
||||
|
||||
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
|
||||
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
|
||||
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
|
||||
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
|
||||
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
|
||||
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
|
||||
}
|
||||
}
|
||||
@ -728,7 +740,7 @@ __global__ void EigenConvolutionKernel3D(
|
||||
const int num_z_output = last_z - first_z + 1;
|
||||
const int num_y_output = last_y - first_y + 1;
|
||||
const int num_x_output = last_x - first_x + 1;
|
||||
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
|
||||
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
|
||||
|
||||
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
|
||||
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
|
||||
@ -741,7 +753,7 @@ __global__ void EigenConvolutionKernel3D(
|
||||
}
|
||||
}
|
||||
}
|
||||
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
|
||||
const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
|
||||
buffer[tensor_index] = result;
|
||||
}
|
||||
}
|
||||
@ -854,9 +866,9 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
|
||||
|
||||
const int maxSharedMem = m_device.sharedMemPerBlock();
|
||||
const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
|
||||
const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
|
||||
const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
|
||||
const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
|
||||
const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
|
||||
const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
|
||||
const int warpSize = 32;
|
||||
|
||||
switch (NumKernelDims) {
|
||||
@ -908,15 +920,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
m_inputImpl.dimensions(), kernel_dims, indices);
|
||||
switch(kernel_size) {
|
||||
case 4: {
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
|
||||
break;
|
||||
}
|
||||
case 7: {
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -969,11 +981,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
case 4: {
|
||||
switch (kernel_size_y) {
|
||||
case 7: {
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -982,18 +994,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
case 7: {
|
||||
switch (kernel_size_y) {
|
||||
case 4: {
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1039,7 +1051,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
|
||||
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
|
||||
m_inputImpl.dimensions(), kernel_dims, indices);
|
||||
|
||||
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
|
||||
LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -35,9 +35,12 @@ struct DefaultDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
// Running on the host CPU
|
||||
return 1;
|
||||
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// Running on a HIP device
|
||||
return 64;
|
||||
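// (Clarifying note, not in the original patch: 64 here matches the AMD GPU wavefront size,
// the HIP counterpart of the 32-thread CUDA warp returned in the CUDA branch below.)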
#else
|
||||
// Running on a CUDA device
|
||||
return 32;
|
||||
@ -45,9 +48,12 @@ struct DefaultDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
// Running on the host CPU
|
||||
return l1CacheSize();
|
||||
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// Running on a HIP device
|
||||
return 48*1024; // FIXME : update this number for HIP
|
||||
#else
|
||||
// Running on a CUDA device, return the amount of shared memory available.
|
||||
return 48*1024;
|
||||
@ -55,9 +61,12 @@ struct DefaultDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
// Running single threaded on the host CPU
|
||||
return l3CacheSize();
|
||||
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// Running on a HIP device
|
||||
return firstLevelCacheSize(); // FIXME : update this number for HIP
|
||||
#else
|
||||
// Running on a CUDA device
|
||||
return firstLevelCacheSize();
|
||||
@ -65,10 +74,14 @@ struct DefaultDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
// Running single threaded on the host CPU
|
||||
// Should return an enum that encodes the ISA supported by the CPU
|
||||
return 1;
|
||||
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// Running on a HIP device
|
||||
// return 1 as major for HIP
|
||||
return 1;
|
||||
#else
|
||||
// Running on a CUDA device
|
||||
return EIGEN_CUDA_ARCH / 100;
|
||||
|
@ -7,21 +7,26 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
|
||||
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
|
||||
|
||||
// This header file contains the defines for the gpu* macros, which will resolve to
|
||||
// their equivalent hip* or cuda* versions depending on the compiler in use
|
||||
// A separate header (included at the end of this file) will undefine all of these gpu* macros.
|
||||
#include "TensorGpuHipCudaDefines.h"
|
||||
|
||||
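// Illustrative sketch (not part of this patch): with the gpu* defines above in effect,
// device-management helpers can be written once and resolve to the hip* or cuda* API
// depending on the compiler. The helper name below is hypothetical.
//   static void* allocateOnCurrentGpuDevice(size_t num_bytes) {
//     void* ptr = NULL;
//     gpuError_t err = gpuMalloc(&ptr, num_bytes);   // hipMalloc under EIGEN_USE_HIP, cudaMalloc otherwise
//     assert(err == gpuSuccess);
//     return ptr;
//   }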
namespace Eigen {
|
||||
|
||||
static const int kCudaScratchSize = 1024;
|
||||
static const int kGpuScratchSize = 1024;
|
||||
|
||||
// This defines an interface that GPUDevice can take to use
|
||||
// CUDA streams underneath.
|
||||
// HIP / CUDA streams underneath.
|
||||
class StreamInterface {
|
||||
public:
|
||||
virtual ~StreamInterface() {}
|
||||
|
||||
virtual const cudaStream_t& stream() const = 0;
|
||||
virtual const cudaDeviceProp& deviceProperties() const = 0;
|
||||
virtual const gpuStream_t& stream() const = 0;
|
||||
virtual const gpuDeviceProp_t& deviceProperties() const = 0;
|
||||
|
||||
// Allocate memory on the actual device where the computation will run
|
||||
virtual void* allocate(size_t num_bytes) const = 0;
|
||||
@ -37,7 +42,7 @@ class StreamInterface {
|
||||
virtual unsigned int* semaphore() const = 0;
|
||||
};
|
||||
|
||||
static cudaDeviceProp* m_deviceProperties;
|
||||
static gpuDeviceProp_t* m_deviceProperties;
|
||||
static bool m_devicePropInitialized = false;
|
||||
|
||||
static void initializeDeviceProp() {
|
||||
@ -58,23 +63,23 @@ static void initializeDeviceProp() {
|
||||
#endif
|
||||
// We're the first thread to reach this point.
|
||||
int num_devices;
|
||||
cudaError_t status = cudaGetDeviceCount(&num_devices);
|
||||
if (status != cudaSuccess) {
|
||||
std::cerr << "Failed to get the number of CUDA devices: "
|
||||
<< cudaGetErrorString(status)
|
||||
gpuError_t status = gpuGetDeviceCount(&num_devices);
|
||||
if (status != gpuSuccess) {
|
||||
std::cerr << "Failed to get the number of GPU devices: "
|
||||
<< gpuGetErrorString(status)
|
||||
<< std::endl;
|
||||
assert(status == cudaSuccess);
|
||||
assert(status == gpuSuccess);
|
||||
}
|
||||
m_deviceProperties = new cudaDeviceProp[num_devices];
|
||||
m_deviceProperties = new gpuDeviceProp_t[num_devices];
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
|
||||
if (status != cudaSuccess) {
|
||||
std::cerr << "Failed to initialize CUDA device #"
|
||||
status = gpuGetDeviceProperties(&m_deviceProperties[i], i);
|
||||
if (status != gpuSuccess) {
|
||||
std::cerr << "Failed to initialize GPU device #"
|
||||
<< i
|
||||
<< ": "
|
||||
<< cudaGetErrorString(status)
|
||||
<< gpuGetErrorString(status)
|
||||
<< std::endl;
|
||||
assert(status == cudaSuccess);
|
||||
assert(status == gpuSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
@ -94,87 +99,87 @@ static void initializeDeviceProp() {
|
||||
}
|
||||
}
|
||||
|
||||
static const cudaStream_t default_stream = cudaStreamDefault;
|
||||
static const gpuStream_t default_stream = gpuStreamDefault;
|
||||
|
||||
class CudaStreamDevice : public StreamInterface {
|
||||
class GpuStreamDevice : public StreamInterface {
|
||||
public:
|
||||
// Use the default stream on the current device
|
||||
CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
|
||||
cudaGetDevice(&device_);
|
||||
GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
|
||||
gpuGetDevice(&device_);
|
||||
initializeDeviceProp();
|
||||
}
|
||||
// Use the default stream on the specified device
|
||||
CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
|
||||
GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
|
||||
initializeDeviceProp();
|
||||
}
|
||||
// Use the specified stream. Note that it's the
|
||||
// caller's responsibility to ensure that the stream can run on
|
||||
// the specified device. If no device is specified the code
|
||||
// assumes that the stream is associated with the current gpu device.
|
||||
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
|
||||
GpuStreamDevice(const gpuStream_t* stream, int device = -1)
|
||||
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
|
||||
if (device < 0) {
|
||||
cudaGetDevice(&device_);
|
||||
gpuGetDevice(&device_);
|
||||
} else {
|
||||
int num_devices;
|
||||
cudaError_t err = cudaGetDeviceCount(&num_devices);
|
||||
gpuError_t err = gpuGetDeviceCount(&num_devices);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
assert(device < num_devices);
|
||||
device_ = device;
|
||||
}
|
||||
initializeDeviceProp();
|
||||
}
|
||||
|
||||
virtual ~CudaStreamDevice() {
|
||||
virtual ~GpuStreamDevice() {
|
||||
if (scratch_) {
|
||||
deallocate(scratch_);
|
||||
}
|
||||
}
|
||||
|
||||
const cudaStream_t& stream() const { return *stream_; }
|
||||
const cudaDeviceProp& deviceProperties() const {
|
||||
const gpuStream_t& stream() const { return *stream_; }
|
||||
const gpuDeviceProp_t& deviceProperties() const {
|
||||
return m_deviceProperties[device_];
|
||||
}
|
||||
virtual void* allocate(size_t num_bytes) const {
|
||||
cudaError_t err = cudaSetDevice(device_);
|
||||
gpuError_t err = gpuSetDevice(device_);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
void* result;
|
||||
err = cudaMalloc(&result, num_bytes);
|
||||
assert(err == cudaSuccess);
|
||||
err = gpuMalloc(&result, num_bytes);
|
||||
assert(err == gpuSuccess);
|
||||
assert(result != NULL);
|
||||
return result;
|
||||
}
|
||||
virtual void deallocate(void* buffer) const {
|
||||
cudaError_t err = cudaSetDevice(device_);
|
||||
gpuError_t err = gpuSetDevice(device_);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
assert(buffer != NULL);
|
||||
err = cudaFree(buffer);
|
||||
assert(err == cudaSuccess);
|
||||
err = gpuFree(buffer);
|
||||
assert(err == gpuSuccess);
|
||||
}
|
||||
|
||||
virtual void* scratchpad() const {
|
||||
if (scratch_ == NULL) {
|
||||
scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
|
||||
scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
|
||||
}
|
||||
return scratch_;
|
||||
}
|
||||
|
||||
virtual unsigned int* semaphore() const {
|
||||
if (semaphore_ == NULL) {
|
||||
char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
|
||||
char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
|
||||
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
|
||||
cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
|
||||
gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
}
|
||||
return semaphore_;
|
||||
}
|
||||
|
||||
private:
|
||||
const cudaStream_t* stream_;
|
||||
const gpuStream_t* stream_;
|
||||
int device_;
|
||||
mutable void* scratch_;
|
||||
mutable unsigned int* semaphore_;
|
||||
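// Typical usage of this class (an illustrative sketch, not part of the patch; the tensors a, b, c
// are hypothetical device-backed TensorMaps):
//   Eigen::GpuStreamDevice stream_device;            // default stream on the current device
//   Eigen::GpuDevice gpu_device(&stream_device);
//   c.device(gpu_device) = a + b;                    // evaluate a tensor expression on that stream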
@ -190,7 +195,7 @@ struct GpuDevice {
|
||||
eigen_assert(stream);
|
||||
}
|
||||
// TODO(bsteiner): This is an internal API, we should not expose it.
|
||||
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
|
||||
EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
|
||||
return stream_->stream();
|
||||
}
|
||||
|
||||
@ -211,11 +216,11 @@ struct GpuDevice {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
|
||||
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||
gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
|
||||
stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
#else
|
||||
EIGEN_UNUSED_VARIABLE(dst);
|
||||
EIGEN_UNUSED_VARIABLE(src);
|
||||
@ -225,24 +230,24 @@ struct GpuDevice {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
cudaError_t err =
|
||||
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
|
||||
gpuError_t err =
|
||||
gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
cudaError_t err =
|
||||
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
|
||||
gpuError_t err =
|
||||
gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
|
||||
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||
gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
@ -260,31 +265,31 @@ struct GpuDevice {
|
||||
|
||||
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
// We won't try to take advantage of the l2 cache for the time being, and
|
||||
// there is no l3 cache on cuda devices.
|
||||
// there is no l3 cache on hip/cuda devices.
|
||||
return firstLevelCacheSize();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
|
||||
#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH)
|
||||
cudaError_t err = cudaStreamSynchronize(stream_->stream());
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Error detected in CUDA stream: "
|
||||
<< cudaGetErrorString(err)
|
||||
#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
gpuError_t err = gpuStreamSynchronize(stream_->stream());
|
||||
if (err != gpuSuccess) {
|
||||
std::cerr << "Error detected in GPU stream: "
|
||||
<< gpuGetErrorString(err)
|
||||
<< std::endl;
|
||||
assert(err == cudaSuccess);
|
||||
assert(err == gpuSuccess);
|
||||
}
|
||||
#else
|
||||
assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
|
||||
EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
|
||||
return stream_->deviceProperties().multiProcessorCount;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
|
||||
EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
|
||||
return stream_->deviceProperties().maxThreadsPerBlock;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
|
||||
EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
|
||||
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
|
||||
@ -301,12 +306,12 @@ struct GpuDevice {
|
||||
return max_blocks_;
|
||||
}
|
||||
|
||||
// This function checks if the CUDA runtime recorded an error for the
|
||||
// This function checks if the GPU runtime recorded an error for the
|
||||
// underlying stream device.
|
||||
inline bool ok() const {
|
||||
#ifdef EIGEN_CUDACC
|
||||
cudaError_t error = cudaStreamQuery(stream_->stream());
|
||||
return (error == cudaSuccess) || (error == cudaErrorNotReady);
|
||||
#ifdef EIGEN_GPUCC
|
||||
gpuError_t error = gpuStreamQuery(stream_->stream());
|
||||
return (error == gpuSuccess) || (error == gpuErrorNotReady);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
@ -317,18 +322,27 @@ struct GpuDevice {
|
||||
int max_blocks_;
|
||||
};
|
||||
|
||||
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
|
||||
#if defined(EIGEN_HIPCC)
|
||||
|
||||
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
|
||||
hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
|
||||
assert(hipGetLastError() == hipSuccess);
|
||||
|
||||
#else
|
||||
|
||||
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
|
||||
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
|
||||
assert(cudaGetLastError() == cudaSuccess);
|
||||
|
||||
|
||||
#endif
|
||||
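// Call-site sketch (illustrative, not part of the patch; MyKernel is a hypothetical __global__ function):
//   LAUNCH_GPU_KERNEL((MyKernel<float>), num_blocks, block_size, /*sharedmem=*/0, device, arg1, arg2);
// Under EIGEN_HIPCC this expands to
//   hipLaunchKernelGGL(MyKernel<float>, dim3(num_blocks), dim3(block_size), 0, device.stream(), arg1, arg2);
// and otherwise to the CUDA triple-chevron launch
//   MyKernel<float><<<num_blocks, block_size, 0, device.stream()>>>(arg1, arg2);
// followed in both cases by an assert on {hip,cuda}GetLastError().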
|
||||
// FIXME: Should be device and kernel specific.
|
||||
#ifdef EIGEN_CUDACC
|
||||
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
|
||||
#ifndef EIGEN_CUDA_ARCH
|
||||
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
|
||||
#ifdef EIGEN_GPUCC
|
||||
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
|
||||
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||
gpuError_t status = gpuDeviceSetSharedMemConfig(config);
|
||||
EIGEN_UNUSED_VARIABLE(status)
|
||||
assert(status == cudaSuccess);
|
||||
assert(status == gpuSuccess);
|
||||
#else
|
||||
EIGEN_UNUSED_VARIABLE(config)
|
||||
#endif
|
||||
@ -337,4 +351,7 @@ static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
|
||||
// undefine all the gpu* macros we defined at the beginning of the file
|
||||
#include "TensorGpuHipCudaUndefines.h"
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
|
@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
|
||||
};
|
||||
|
||||
|
||||
#if defined(EIGEN_CUDACC)
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template <typename Evaluator, typename Index, bool Vectorizable>
|
||||
struct EigenMetaKernelEval {
|
||||
static __device__ EIGEN_ALWAYS_INLINE
|
||||
@ -250,21 +250,22 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
|
||||
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
|
||||
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
||||
if (needs_assign) {
|
||||
const int block_size = device.maxCudaThreadsPerBlock();
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
|
||||
const int block_size = device.maxGpuThreadsPerBlock();
|
||||
const int max_blocks = device.getNumGpuMultiProcessors() *
|
||||
device.maxGpuThreadsPerMultiProcessor() / block_size;
|
||||
const Index size = array_prod(evaluator.dimensions());
|
||||
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
|
||||
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
|
||||
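// Worked example (hypothetical device numbers, for illustration only): with
// maxGpuThreadsPerBlock() == 1024, getNumGpuMultiProcessors() == 60 and
// maxGpuThreadsPerMultiProcessor() == 2048, block_size is 1024 and
// max_blocks = 60 * 2048 / 1024 = 120; for size == 50000 coefficients,
// divup(50000, 1024) == 49, so num_blocks = max(min(120, 49), 1) == 49 blocks are launched.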
|
||||
LAUNCH_CUDA_KERNEL(
|
||||
LAUNCH_GPU_KERNEL(
|
||||
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
|
||||
num_blocks, block_size, 0, device, evaluator, size);
|
||||
}
|
||||
evaluator.cleanup();
|
||||
}
|
||||
|
||||
#endif // EIGEN_CUDACC
|
||||
#endif // EIGEN_GPUCC
|
||||
#endif // EIGEN_USE_GPU
|
||||
|
||||
// SYCL Executor policy
|
||||
|
@ -109,7 +109,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
||||
#if !defined(EIGEN_HIPCC)
|
||||
EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
|
||||
const Index numValues = internal::array_prod(m_impl.dimensions());
|
||||
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
|
||||
// Should initialize the memory in case we're dealing with non POD types.
|
||||
|
unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h (new file, 87 lines)
@ -0,0 +1,87 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
|
||||
#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
|
||||
|
||||
// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
|
||||
// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but
|
||||
// for some reason it gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
|
||||
// When compiling such files, gcc will end up trying to pick up the CUDA headers by
|
||||
// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
|
||||
// This will obviously not work when trying to compile tensorflow on a system with no CUDA
|
||||
// To work around this issue for HIP systems (and leave the default behaviour intact), the
|
||||
// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
|
||||
// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
|
||||
// defined. Consistent with that requirement, the guard here needs to be EIGEN_USE_HIP as well
|
||||
|
||||
#if defined(EIGEN_USE_HIP)
|
||||
|
||||
#define gpuStream_t hipStream_t
|
||||
#define gpuDeviceProp_t hipDeviceProp_t
|
||||
#define gpuError_t hipError_t
|
||||
#define gpuSuccess hipSuccess
|
||||
#define gpuErrorNotReady hipErrorNotReady
|
||||
#define gpuGetDeviceCount hipGetDeviceCount
|
||||
#define gpuGetErrorString hipGetErrorString
|
||||
#define gpuGetDeviceProperties hipGetDeviceProperties
|
||||
#define gpuStreamDefault hipStreamDefault
|
||||
#define gpuGetDevice hipGetDevice
|
||||
#define gpuSetDevice hipSetDevice
|
||||
#define gpuMalloc hipMalloc
|
||||
#define gpuFree hipFree
|
||||
#define gpuMemsetAsync hipMemsetAsync
|
||||
#define gpuMemcpyAsync hipMemcpyAsync
|
||||
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define gpuStreamQuery hipStreamQuery
|
||||
#define gpuSharedMemConfig hipSharedMemConfig
|
||||
#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
|
||||
#define gpuStreamSynchronize hipStreamSynchronize
|
||||
#define gpuDeviceSynchronize hipDeviceSynchronize
|
||||
#define gpuMemcpy hipMemcpy
|
||||
|
||||
#else
|
||||
|
||||
#define gpuStream_t cudaStream_t
|
||||
#define gpuDeviceProp_t cudaDeviceProp
|
||||
#define gpuError_t cudaError_t
|
||||
#define gpuSuccess cudaSuccess
|
||||
#define gpuErrorNotReady cudaErrorNotReady
|
||||
#define gpuGetDeviceCount cudaGetDeviceCount
|
||||
#define gpuGetErrorString cudaGetErrorString
|
||||
#define gpuGetDeviceProperties cudaGetDeviceProperties
|
||||
#define gpuStreamDefault cudaStreamDefault
|
||||
#define gpuGetDevice cudaGetDevice
|
||||
#define gpuSetDevice cudaSetDevice
|
||||
#define gpuMalloc cudaMalloc
|
||||
#define gpuFree cudaFree
|
||||
#define gpuMemsetAsync cudaMemsetAsync
|
||||
#define gpuMemcpyAsync cudaMemcpyAsync
|
||||
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
|
||||
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
|
||||
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
|
||||
#define gpuStreamQuery cudaStreamQuery
|
||||
#define gpuSharedMemConfig cudaSharedMemConfig
|
||||
#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
|
||||
#define gpuStreamSynchronize cudaStreamSynchronize
|
||||
#define gpuDeviceSynchronize cudaDeviceSynchronize
|
||||
#define gpuMemcpy cudaMemcpy
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// HIPCC does not support the use of assert on the GPU side.
|
||||
#undef assert
|
||||
#define assert(COND)
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
|
@ -0,0 +1,40 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
|
||||
|
||||
#undef gpuStream_t
|
||||
#undef gpuDeviceProp_t
|
||||
#undef gpuError_t
|
||||
#undef gpuSuccess
|
||||
#undef gpuErrorNotReady
|
||||
#undef gpuGetDeviceCount
|
||||
#undef gpuGetErrorString
|
||||
#undef gpuGetDeviceProperties
|
||||
#undef gpuStreamDefault
|
||||
#undef gpuGetDevice
|
||||
#undef gpuSetDevice
|
||||
#undef gpuMalloc
|
||||
#undef gpuFree
|
||||
#undef gpuMemsetAsync
|
||||
#undef gpuMemcpyAsync
|
||||
#undef gpuMemcpyDeviceToDevice
|
||||
#undef gpuMemcpyDeviceToHost
|
||||
#undef gpuMemcpyHostToDevice
|
||||
#undef gpuStreamQuery
|
||||
#undef gpuSharedMemConfig
|
||||
#undef gpuDeviceSetSharedMemConfig
|
||||
#undef gpuStreamSynchronize
|
||||
#undef gpuDeviceSynchronize
|
||||
#undef gpuMemcpy
|
||||
|
||||
#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
|
@ -350,7 +350,8 @@ struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
|
||||
template<typename FirstType, typename... OtherTypes>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
|
||||
size_t result = 1;
|
||||
for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
|
||||
result *= sizes[i];
|
||||
|
@ -35,7 +35,7 @@ namespace {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
|
||||
{
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||
return __clz(val);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return cl::sycl::clz(val);
|
||||
@ -53,7 +53,7 @@ namespace {
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
|
||||
{
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||
return __clzll(val);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return cl::sycl::clz(val);
|
||||
@ -90,7 +90,7 @@ namespace {
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return __umulhi(a, b);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
|
||||
@ -101,7 +101,7 @@ namespace {
|
||||
|
||||
template <typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
|
||||
#if defined(EIGEN_CUDA_ARCH)
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return __umul64hi(a, b);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
|
||||
@ -124,7 +124,7 @@ namespace {
|
||||
template <typename T>
|
||||
struct DividerHelper<64, T> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
|
||||
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
|
||||
#else
|
||||
const uint64_t shift = 1ULL << log_div;
|
||||
@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||
return (__umulhi(magic, n) >> shift);
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
|
||||
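// Host-side illustration of the multiply-high trick used by divide() above (a sketch added
// for clarity, not part of the patch). For the fixed divisor 3, the standard 32-bit magic
// constant is 0xAAAAAAAB with a post-shift of 1, so the division becomes one multiply-high
// plus one shift -- exactly what __umulhi(magic, n) >> shift does on the device:
//
//   #include <cstdint>
//   static inline uint32_t div_by_3(uint32_t n) {
//     const uint64_t magic = 0xAAAAAAABull;                   // ceil(2^33 / 3)
//     return static_cast<uint32_t>(((magic * n) >> 32) >> 1); // high 32 bits of magic*n, then >> 1
//   }
//   // div_by_3(9) == 3, div_by_3(10) == 3, div_by_3(0xFFFFFFFFu) == 0x55555555u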
|
@ -27,7 +27,7 @@
|
||||
*/
|
||||
|
||||
// SFINAE requires variadic templates
|
||||
#ifndef EIGEN_CUDACC
|
||||
#if !defined(EIGEN_GPUCC)
|
||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
// SFINAE doesn't work for gcc <= 4.7
|
||||
#ifdef EIGEN_COMP_GNUC
|
||||
|
@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
|
||||
};
|
||||
|
||||
// For CUDA packet types when using a GpuDevice
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
|
||||
template <>
|
||||
struct PacketType<half, GpuDevice> {
|
||||
typedef half2 type;
|
||||
|
@ -858,8 +858,8 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
|
||||
#ifndef __SYCL_DEVICE_ONLY__
|
||||
return numext::maxi(min, numext::mini(max,value));
|
||||
#else
|
||||
|
@ -16,7 +16,7 @@ namespace internal {
|
||||
namespace {
|
||||
|
||||
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
|
||||
#ifdef EIGEN_CUDA_ARCH
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
// We don't support 3d kernels since we currently only use 1 and
|
||||
// 2d kernels.
|
||||
assert(threadIdx.z == 0);
|
||||
|
@ -334,12 +334,12 @@ struct OuterReducer {
|
||||
};
|
||||
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
|
||||
template <int B, int N, typename S, typename R, typename I>
|
||||
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
template <typename S, typename R, typename I>
|
||||
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
||||
template <int B, int N, typename S, typename R, typename I>
|
||||
@ -495,7 +495,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
|
||||
EIGEN_STRONG_INLINE
|
||||
#if !defined(EIGEN_HIPCC)
|
||||
// Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same for all the functions
|
||||
// being called within here, which then leads to a proliferation of EIGEN_DEVICE_FUNC markings, one
|
||||
// of which will eventually result in an NVCC error
|
||||
EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
|
||||
m_impl.evalSubExprsIfNeeded(NULL);
|
||||
|
||||
// Use the FullReducer if possible.
|
||||
@ -694,9 +701,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
||||
#ifdef EIGEN_USE_THREADS
|
||||
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
|
||||
#endif
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
|
||||
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
||||
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
|
||||
template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
|
||||
@ -781,7 +788,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
||||
Op m_reducer;
|
||||
|
||||
// For full reductions
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
|
||||
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
|
||||
static const bool RunningOnSycl = false;
|
||||
#elif defined(EIGEN_USE_SYCL)
|
||||
|
@ -7,23 +7,23 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
|
||||
// Full reducers for GPU, don't vectorize for now
|
||||
|
||||
// Reducer function that enables multiple cuda thread to safely accumulate at the same
|
||||
// Reducer function that enables multiple gpu threads to safely accumulate at the same
|
||||
// output address. It basically reads the current value of the output variable, and
|
||||
// attempts to update it with the new value. If in the meantime another cuda thread
|
||||
// attempts to update it with the new value. If in the meantime another gpu thread
|
||||
// updated the content of the output address it will try again.
|
||||
template <typename T, typename R>
|
||||
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
|
||||
#if EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
|
||||
if (sizeof(T) == 4)
|
||||
{
|
||||
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
||||
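// The excerpt above is cut short by the diff; for clarity, a sketch of the compare-and-swap
// retry loop the comment describes (illustrative only, 4-byte T case):
//   unsigned int newval = oldval;
//   reducer.reduce(accum, reinterpret_cast<T*>(&newval));
//   unsigned int readback;
//   while ((readback = atomicCAS(reinterpret_cast<unsigned int*>(output), oldval, newval)) != oldval) {
//     oldval = readback;                         // another thread won the race; retry on top of its value
//     newval = oldval;
//     reducer.reduce(accum, reinterpret_cast<T*>(&newval));
//   }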
@ -79,7 +79,7 @@ __device__ inline double atomicExchCustom(double* address, double val) {
|
||||
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
|
||||
}
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
template <template <typename T> class R>
|
||||
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
|
||||
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
||||
@ -98,11 +98,11 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // EIGEN_HAS_CUDA_FP16
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
|
||||
template <>
|
||||
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
||||
#if EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
|
||||
atomicAdd(output, accum);
|
||||
#else // EIGEN_CUDA_ARCH >= 300
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
|
||||
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
|
||||
#if EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
|
||||
// Initialize the output value
|
||||
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
|
||||
if (gridDim.x == 1) {
|
||||
@ -168,7 +168,16 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// use std::is_floating_point to determine the type of reduced_val
|
||||
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error
|
||||
// and list the float and int versions of __shfl_down as the candidate functions.
|
||||
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
|
||||
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
|
||||
} else {
|
||||
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
|
||||
}
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
|
||||
#else
|
||||
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
|
||||
@ -182,6 +191,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
|
||||
if (gridDim.x > 1 && threadIdx.x == 0) {
|
||||
// Let the last block reset the semaphore
|
||||
atomicInc(semaphore, gridDim.x + 1);
|
||||
#if defined(EIGEN_HIPCC)
|
||||
__threadfence_system();
|
||||
#endif
|
||||
}
|
||||
#else // EIGEN_CUDA_ARCH >= 300
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
@ -189,7 +201,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
|
||||
}
|
||||
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
template <typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
|
||||
@ -227,6 +239,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
|
||||
|
||||
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
|
||||
|
||||
if (gridDim.x == 1) {
|
||||
if (first_index == 0) {
|
||||
if (num_coeffs % 2 != 0) {
|
||||
@ -238,7 +251,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
half2 accum = reducer.template initializePacket<half2>();
|
||||
const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
|
||||
for (Index i = 0; i < max_iter; i += BlockSize) {
|
||||
@ -250,7 +263,13 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
|
||||
union { int i; half2 h; } wka_in, wka_out;
|
||||
wka_in.h = accum;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
reducer.reducePacket(wka_out.h, &accum);
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
|
||||
#else
|
||||
int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
|
||||
@ -280,7 +299,7 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
|
||||
*output = tmp;
|
||||
}
|
||||
|
||||
#endif // EIGEN_HAS_CUDA_FP16
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
|
||||
struct FullReductionLauncher {
|
||||
@ -298,6 +317,7 @@ struct FullReductionLauncher<
|
||||
internal::is_same<double, OutputType>::value,
|
||||
void>::type> {
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
|
||||
|
||||
typedef typename Self::Index Index;
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
@ -308,12 +328,12 @@ struct FullReductionLauncher<
|
||||
semaphore = device.semaphore();
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
|
||||
LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
template <typename Self, typename Op>
|
||||
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
|
||||
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
|
||||
@ -334,20 +354,20 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
|
||||
// won't be race conditions between multiple thread blocks.
|
||||
LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
|
||||
LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
|
||||
1, 1, 0, device, reducer, self, num_coeffs, scratch);
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
|
||||
LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
|
||||
LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
|
||||
1, 1, 0, device, reducer, output, scratch);
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif // EIGEN_HAS_CUDA_FP16
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
|
||||
|
||||
template <typename Self, typename Op, bool Vectorizable>
|
||||
@ -355,16 +375,16 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
|
||||
// Unfortunately nvidia doesn't support exotic types such as complex well,
|
||||
// so reduce the scope of the optimized version of the code to the simple cases
|
||||
// of doubles, floats and half floats
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
|
||||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
|
||||
#else // EIGEN_HAS_CUDA_FP16
|
||||
#else // EIGEN_HAS_GPU_FP16
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
#endif // EIGEN_HAS_CUDA_FP16
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
|
||||
template <typename OutputType>
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
|
||||
@ -384,7 +404,7 @@ template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||
typename Self::CoeffReturnType* output) {
|
||||
#if EIGEN_CUDA_ARCH >= 300
|
||||
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
|
||||
typedef typename Self::CoeffReturnType Type;
|
||||
eigen_assert(blockDim.y == 1);
|
||||
eigen_assert(blockDim.z == 1);
|
||||
@ -437,7 +457,16 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// use std::is_floating_point to determine the type of reduced_val
|
||||
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
|
||||
// and list the float and int versions of __shfl_down as the candidate functions.
|
||||
if (std::is_floating_point<Type>::value) {
|
||||
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
|
||||
} else {
|
||||
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
|
||||
}
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
|
||||
#else
|
||||
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
|
||||
@ -454,7 +483,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
|
||||
#endif // EIGEN_CUDA_ARCH >= 300
|
||||
}
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
|
||||
template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
@ -531,7 +560,18 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
|
||||
union { int i; half2 h; } wka_in, wka_out;
|
||||
|
||||
wka_in.h = reduced_val1;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
reducer.reducePacket(wka_out.h, &reduced_val1);
|
||||
|
||||
wka_in.h = reduced_val2;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
reducer.reducePacket(wka_out.h, &reduced_val2);
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
|
||||
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
|
||||
#else
|
||||
@ -556,7 +596,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
}
|
||||
}
|
||||
|
||||
#endif // EIGEN_HAS_CUDA_FP16
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
|
||||
struct InnerReductionLauncher {
|
||||
@ -581,30 +621,30 @@ struct InnerReductionLauncher<
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const int max_blocks = device.getNumGpuMultiProcessors() *
|
||||
device.maxGpuThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
||||
// won't be a race conditions between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / 1024;
|
||||
const int max_blocks = device.getNumGpuMultiProcessors() *
|
||||
device.maxGpuThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
|
||||
LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
|
||||
num_blocks, 1024, 0, device, reducer.initialize(),
|
||||
num_preserved_vals, output);
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
|
||||
LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
template <typename Self, typename Op>
|
||||
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
|
||||
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
|
||||
@ -627,28 +667,28 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
|
||||
const int block_size = /*256*/128;
|
||||
const int num_per_thread = /*128*/64;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const int max_blocks = device.getNumGpuMultiProcessors() *
|
||||
device.maxGpuThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
||||
// won't be race conditions between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / 1024;
|
||||
const int max_blocks = device.getNumGpuMultiProcessors() *
|
||||
device.maxGpuThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
|
||||
LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
|
||||
1, 1, 0, device, reducer, self, num_preserved_vals, output);
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
|
||||
LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
#endif // EIGEN_HAS_CUDA_FP16
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
|
||||
|
||||
template <typename Self, typename Op>
|
||||
@ -656,16 +696,16 @@ struct InnerReducer<Self, Op, GpuDevice> {
|
||||
// Unfortunately nvidia doesn't support exotic types such as complex well,
|
||||
// so reduce the scope of the optimized version of the code to the simple case
|
||||
// of doubles, floats and half floats.
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
|
||||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
|
||||
#else // EIGEN_HAS_CUDA_FP16
|
||||
#else // EIGEN_HAS_GPU_FP16
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
#endif // EIGEN_HAS_CUDA_FP16
|
||||
#endif // EIGEN_HAS_GPU_FP16
|
||||
|
||||
template <typename OutputType>
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
@ -723,7 +763,20 @@ struct OuterReducer<Self, Op, GpuDevice> {
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
template <typename Device, typename OutputType>
|
||||
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
|
||||
static
|
||||
#if !defined(EIGEN_HIPCC)
|
||||
// FIXME : leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
|
||||
// (in the cxx11_tensor_reduction_gpu test)
|
||||
//
|
||||
// terminate called after throwing an instance of 'std::runtime_error'
|
||||
// what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
|
||||
//
|
||||
// don't know why this happens (and why it is a runtime error instead of a compile-time error)
|
||||
//
|
||||
// this will be fixed by HIP PR#457
|
||||
EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
|
||||
assert(false && "Should only be called to reduce doubles or floats on a gpu device");
|
||||
return true;
|
||||
}
|
||||
@ -740,33 +793,33 @@ struct OuterReducer<Self, Op, GpuDevice> {
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 16;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||
const int max_blocks = device.getNumGpuMultiProcessors() *
|
||||
device.maxGpuThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs in the reduction kernel itself when we don't have to worry
|
||||
// about race conditions between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumCudaMultiProcessors() *
|
||||
device.maxCudaThreadsPerMultiProcessor() / 1024;
|
||||
const int max_blocks = device.getNumGpuMultiProcessors() *
|
||||
device.maxGpuThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
|
||||
LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
|
||||
num_blocks, 1024, 0, device, reducer.initialize(),
|
||||
num_preserved_vals, output);
|
||||
}
|
||||
|
||||
LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
|
||||
LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
|
||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
|
@ -242,7 +242,7 @@ struct ScanLauncher {
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
|
||||
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
|
||||
|
||||
// GPU implementation of scan
|
||||
// TODO(ibab) This placeholder implementation performs multiple scans in
|
||||
@ -278,10 +278,11 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
|
||||
Index total_size = internal::array_prod(self.dimensions());
|
||||
Index num_blocks = (total_size / self.size() + 63) / 64;
|
||||
Index block_size = 64;
|
||||
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
|
||||
|
||||
LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
|
||||
}
|
||||
};
|
||||
#endif // EIGEN_USE_GPU && EIGEN_CUDACC
|
||||
#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
@ -268,7 +268,7 @@ template<
typename Reducer
> struct reduce<Reducer>
{
constexpr static inline int run() { return Reducer::Identity; }
EIGEN_DEVICE_FUNC constexpr static inline int run() { return Reducer::Identity; }
};

template<
@ -276,7 +276,7 @@ template<
typename A
> struct reduce<Reducer, A>
{
constexpr static inline A run(A a) { return a; }
EIGEN_DEVICE_FUNC constexpr static inline A run(A a) { return a; }
};

template<
@ -285,7 +285,7 @@ template<
typename... Ts
> struct reduce<Reducer, A, Ts...>
{
constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
}
};
@ -324,7 +324,7 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
// does...
template<typename... Ts>
constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
EIGEN_DEVICE_FUNC constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
{
return reduce<product_op, Ts...>::run(ts...);
}
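For readers unfamiliar with this meta-programming helper, the stand-alone sketch below shows the same recursive fold in isolation. The product_op here is a stand-in written for illustration only, not Eigen's actual functor:

// Illustration of the variadic fold pattern annotated with EIGEN_DEVICE_FUNC above.
#include <cassert>

struct product_op {  // hypothetical reducer for this example
  static constexpr int Identity = 1;
  template <typename A, typename B>
  static constexpr auto run(A a, B b) -> decltype(a * b) { return a * b; }
};

template <typename Reducer, typename... Ts> struct reduce;

template <typename Reducer> struct reduce<Reducer> {
  // Empty pack: fall back to the reducer's identity element.
  static constexpr int run() { return Reducer::Identity; }
};

template <typename Reducer, typename A> struct reduce<Reducer, A> {
  // Single element: nothing left to fold.
  static constexpr A run(A a) { return a; }
};

template <typename Reducer, typename A, typename... Ts>
struct reduce<Reducer, A, Ts...> {
  static constexpr auto run(A a, Ts... ts)
      -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
    // Peel off the first argument and fold the rest.
    return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
  }
};

int main() {
  // arg_prod(2, 3, 4) in the patch reduces exactly like this: 2 * (3 * 4) = 24.
  assert((reduce<product_op, int, int, int>::run(2, 3, 4)) == 24);
  return 0;
}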
@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)

namespace Eigen {
template <typename T, size_t n> class array {

@ -53,8 +53,8 @@ namespace Eigen {
#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"

#if defined EIGEN_VECTORIZE_CUDA
#include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
#if defined EIGEN_VECTORIZE_GPU
#include "src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h"
#endif

namespace Eigen {
@ -190,7 +190,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgammaf_r(x, &dummy);
#else
@ -203,7 +203,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int dummy;
return ::lgamma_r(x, &dummy);
#else
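The host-only lgammaf_r / lgamma_r branch is now guarded by EIGEN_GPU_COMPILE_PHASE rather than the CUDA-specific EIGEN_CUDA_ARCH, i.e. it is skipped whenever the device-side pass of either back end is compiling this code. As a rough sketch of the idea (an assumption for illustration, not the patch's actual definition), such a macro could be derived from the per-backend device-compile macros:

// Hypothetical illustration only.
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#define EIGEN_GPU_COMPILE_PHASE
#endif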
@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
#define EIGEN_CUDA_SPECIALFUNCTIONS_H
#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
#define EIGEN_GPU_SPECIALFUNCTIONS_H

namespace Eigen {

@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)

template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plgamma<float4>(const float4& a)
@ -223,4 +223,4 @@ pi1e<double2>(const double2& x) {

} // end namespace Eigen

#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
#endif // EIGEN_GPU_SPECIALFUNCTIONS_H
@ -275,26 +275,83 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")

ei_add_test(cxx11_tensor_complex_cuda)
ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
ei_add_test(cxx11_tensor_reduction_cuda)
ei_add_test(cxx11_tensor_argmax_cuda)
ei_add_test(cxx11_tensor_cast_float16_cuda)
ei_add_test(cxx11_tensor_scan_cuda)
ei_add_test(cxx11_tensor_complex_gpu)
ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
ei_add_test(cxx11_tensor_reduction_gpu)
ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)

# Contractions require arch 3.0 or higher
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_cuda)
ei_add_test(cxx11_tensor_contract_cuda)
ei_add_test(cxx11_tensor_of_float16_cuda)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
endif()

# The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
ei_add_test(cxx11_tensor_random_cuda)
ei_add_test(cxx11_tensor_random_gpu)
endif()


unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()

# Add HIP specific tests
if (EIGEN_TEST_HIP)

set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")

if (EXISTS ${HIP_PATH})

list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)

find_package(HIP REQUIRED)
if (HIP_FOUND)

execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)

if (${HIP_PLATFORM} STREQUAL "hcc")

include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${HIP_PATH}/include)

set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
#
# complex datatype is not yet supported by HIP
# so leaving out those tests for now
#
# ei_add_test(cxx11_tensor_complex_gpu)
# ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
#
ei_add_test(cxx11_tensor_reduction_gpu)
ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
ei_add_test(cxx11_tensor_device)

ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
ei_add_test(cxx11_tensor_random_gpu)

unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)

elseif (${HIP_PLATFORM} STREQUAL "nvcc")
message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
else ()
message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
endif()

endif(HIP_FOUND)

else ()

message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")

endif()

endif(EIGEN_TEST_HIP)
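The renamed tests that follow include a new TensorGpuHipCudaDefines.h header and call gpuMalloc, gpuMemcpy, gpuFree and friends instead of the cuda* entry points. That header's contents are not reproduced on this page, so the mapping below is only a plausible sketch of the idea, with the backend selection keyed off EIGEN_USE_HIP as the tests do; the real header may differ in names and scope:

// Hedged sketch of a gpu* compatibility layer over the two runtimes.
#if defined(EIGEN_USE_HIP)
#include <hip/hip_runtime.h>
#define gpuMalloc              hipMalloc
#define gpuMemcpy              hipMemcpy
#define gpuMemcpyAsync         hipMemcpyAsync
#define gpuMemcpyHostToDevice  hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost  hipMemcpyDeviceToHost
#define gpuStreamSynchronize   hipStreamSynchronize
#define gpuFree                hipFree
#define gpuSuccess             hipSuccess
#else
#include <cuda_runtime.h>
#define gpuMalloc              cudaMalloc
#define gpuMemcpy              cudaMemcpy
#define gpuMemcpyAsync         cudaMemcpyAsync
#define gpuMemcpyHostToDevice  cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost  cudaMemcpyDeviceToHost
#define gpuStreamSynchronize   cudaStreamSynchronize
#define gpuFree                cudaFree
#define gpuSuccess             cudaSuccess
#endif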
@ -9,16 +9,18 @@
|
||||
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_gpu
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
template <int Layout>
|
||||
void test_cuda_simple_argmax()
|
||||
void test_gpu_simple_argmax()
|
||||
{
|
||||
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
|
||||
Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
|
||||
@ -34,13 +36,13 @@ void test_cuda_simple_argmax()
|
||||
double* d_in;
|
||||
DenseIndex* d_out_max;
|
||||
DenseIndex* d_out_min;
|
||||
cudaMalloc((void**)(&d_in), in_bytes);
|
||||
cudaMalloc((void**)(&d_out_max), out_bytes);
|
||||
cudaMalloc((void**)(&d_out_min), out_bytes);
|
||||
gpuMalloc((void**)(&d_in), in_bytes);
|
||||
gpuMalloc((void**)(&d_out_max), out_bytes);
|
||||
gpuMalloc((void**)(&d_out_min), out_bytes);
|
||||
|
||||
cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
|
||||
@ -50,20 +52,20 @@ void test_cuda_simple_argmax()
|
||||
gpu_out_max.device(gpu_device) = gpu_in.argmax();
|
||||
gpu_out_min.device(gpu_device) = gpu_in.argmin();
|
||||
|
||||
assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
|
||||
assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
|
||||
assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
|
||||
|
||||
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
|
||||
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
|
||||
|
||||
cudaFree(d_in);
|
||||
cudaFree(d_out_max);
|
||||
cudaFree(d_out_min);
|
||||
gpuFree(d_in);
|
||||
gpuFree(d_out_max);
|
||||
gpuFree(d_out_min);
|
||||
}
|
||||
|
||||
template <int DataLayout>
|
||||
void test_cuda_argmax_dim()
|
||||
void test_gpu_argmax_dim()
|
||||
{
|
||||
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
|
||||
std::vector<int> dims;
|
||||
@ -97,12 +99,12 @@ void test_cuda_argmax_dim()
|
||||
|
||||
float* d_in;
|
||||
DenseIndex* d_out;
|
||||
cudaMalloc((void**)(&d_in), in_bytes);
|
||||
cudaMalloc((void**)(&d_out), out_bytes);
|
||||
gpuMalloc((void**)(&d_in), in_bytes);
|
||||
gpuMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
|
||||
@ -110,8 +112,8 @@ void test_cuda_argmax_dim()
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
|
||||
|
||||
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
|
||||
|
||||
VERIFY_IS_EQUAL(tensor_arg.size(),
|
||||
size_t(2*3*5*7 / tensor.dimension(dim)));
|
||||
@ -134,25 +136,25 @@ void test_cuda_argmax_dim()
|
||||
}
|
||||
}
|
||||
|
||||
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
|
||||
|
||||
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
|
||||
|
||||
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
|
||||
// Expect max to be in the last index of the reduced dimension
|
||||
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
|
||||
}
|
||||
|
||||
cudaFree(d_in);
|
||||
cudaFree(d_out);
|
||||
gpuFree(d_in);
|
||||
gpuFree(d_out);
|
||||
}
|
||||
}
|
||||
|
||||
template <int DataLayout>
|
||||
void test_cuda_argmin_dim()
|
||||
void test_gpu_argmin_dim()
|
||||
{
|
||||
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
|
||||
std::vector<int> dims;
|
||||
@ -186,12 +188,12 @@ void test_cuda_argmin_dim()
|
||||
|
||||
float* d_in;
|
||||
DenseIndex* d_out;
|
||||
cudaMalloc((void**)(&d_in), in_bytes);
|
||||
cudaMalloc((void**)(&d_out), out_bytes);
|
||||
gpuMalloc((void**)(&d_in), in_bytes);
|
||||
gpuMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
|
||||
@ -199,8 +201,8 @@ void test_cuda_argmin_dim()
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
|
||||
|
||||
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
|
||||
|
||||
VERIFY_IS_EQUAL(tensor_arg.size(),
|
||||
2*3*5*7 / tensor.dimension(dim));
|
||||
@ -223,29 +225,29 @@ void test_cuda_argmin_dim()
|
||||
}
|
||||
}
|
||||
|
||||
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
|
||||
|
||||
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
|
||||
|
||||
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
|
||||
// Expect max to be in the last index of the reduced dimension
|
||||
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
|
||||
}
|
||||
|
||||
cudaFree(d_in);
|
||||
cudaFree(d_out);
|
||||
gpuFree(d_in);
|
||||
gpuFree(d_out);
|
||||
}
|
||||
}
|
||||
|
||||
void test_cxx11_tensor_cuda()
|
||||
void test_cxx11_tensor_gpu()
|
||||
{
|
||||
CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
|
||||
CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
|
||||
CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
|
||||
CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
|
||||
CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
|
||||
CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
|
||||
CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
|
||||
CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
|
||||
CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
|
||||
CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
|
||||
CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
|
||||
CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
|
||||
}
|
@ -9,7 +9,7 @@
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_gpu
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
@ -18,8 +18,8 @@
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
void test_cuda_conversion() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
void test_gpu_conversion() {
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
@ -72,8 +72,8 @@ void test_fallback_conversion() {
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_cast_float16_cuda()
|
||||
void test_cxx11_tensor_cast_float16_gpu()
|
||||
{
|
||||
CALL_SUBTEST(test_cuda_conversion());
|
||||
CALL_SUBTEST(test_gpu_conversion());
|
||||
CALL_SUBTEST(test_fallback_conversion());
|
||||
}
|
@ -28,7 +28,7 @@ void test_cuda_complex_cwise_ops() {
|
||||
cudaMalloc((void**)(&d_in2), complex_bytes);
|
||||
cudaMalloc((void**)(&d_out), complex_bytes);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
|
@ -34,7 +34,7 @@ void test_cuda_nullary() {
|
||||
cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
|
||||
@ -70,7 +70,7 @@ void test_cuda_nullary() {
|
||||
|
||||
static void test_cuda_sum_reductions() {
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
const int num_rows = internal::random<int>(1024, 5*1024);
|
||||
@ -106,7 +106,7 @@ static void test_cuda_sum_reductions() {
|
||||
|
||||
static void test_cuda_mean_reductions() {
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
const int num_rows = internal::random<int>(1024, 5*1024);
|
||||
@ -142,7 +142,7 @@ static void test_cuda_mean_reductions() {
|
||||
|
||||
static void test_cuda_product_reductions() {
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
const int num_rows = internal::random<int>(1024, 5*1024);
|
@ -10,19 +10,20 @@
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_gpu
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
|
||||
|
||||
using Eigen::Tensor;
|
||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction(int m_size, int k_size, int n_size)
|
||||
void test_gpu_contraction(int m_size, int k_size, int n_size)
|
||||
{
|
||||
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// with these dimensions, the output has 300 * 140 elements, which is
|
||||
@ -45,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
|
||||
float* d_t_right;
|
||||
float* d_t_result;
|
||||
|
||||
cudaMalloc((void**)(&d_t_left), t_left_bytes);
|
||||
cudaMalloc((void**)(&d_t_right), t_right_bytes);
|
||||
cudaMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
gpuMalloc((void**)(&d_t_left), t_left_bytes);
|
||||
gpuMalloc((void**)(&d_t_right), t_right_bytes);
|
||||
gpuMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
|
||||
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
|
||||
gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
@ -66,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
|
||||
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
|
||||
gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
|
||||
for (DenseIndex i = 0; i < t_result.size(); i++) {
|
||||
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
|
||||
continue;
|
||||
@ -79,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
|
||||
assert(false);
|
||||
}
|
||||
|
||||
cudaFree((void*)d_t_left);
|
||||
cudaFree((void*)d_t_right);
|
||||
cudaFree((void*)d_t_result);
|
||||
gpuFree((void*)d_t_left);
|
||||
gpuFree((void*)d_t_right);
|
||||
gpuFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
|
||||
@ -109,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size)
|
||||
float* d_t_right;
|
||||
float* d_t_result;
|
||||
|
||||
cudaMalloc((void**)(&d_t_left), t_left_bytes);
|
||||
cudaMalloc((void**)(&d_t_right), t_right_bytes);
|
||||
cudaMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
gpuMalloc((void**)(&d_t_left), t_left_bytes);
|
||||
gpuMalloc((void**)(&d_t_right), t_right_bytes);
|
||||
gpuMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
|
||||
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
|
||||
gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
@ -129,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size)
|
||||
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
|
||||
gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
|
||||
if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
|
||||
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
|
||||
std::cout << "mismatch detected: " << t_result()
|
||||
@ -137,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size)
|
||||
assert(false);
|
||||
}
|
||||
|
||||
cudaFree((void*)d_t_left);
|
||||
cudaFree((void*)d_t_right);
|
||||
cudaFree((void*)d_t_result);
|
||||
gpuFree((void*)d_t_left);
|
||||
gpuFree((void*)d_t_right);
|
||||
gpuFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction_m() {
|
||||
void test_gpu_contraction_m() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_cuda_contraction<ColMajor>(k, 128, 128);
|
||||
test_cuda_contraction<RowMajor>(k, 128, 128);
|
||||
test_gpu_contraction<ColMajor>(k, 128, 128);
|
||||
test_gpu_contraction<RowMajor>(k, 128, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction_k() {
|
||||
void test_gpu_contraction_k() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_cuda_contraction<ColMajor>(128, k, 128);
|
||||
test_cuda_contraction<RowMajor>(128, k, 128);
|
||||
test_gpu_contraction<ColMajor>(128, k, 128);
|
||||
test_gpu_contraction<RowMajor>(128, k, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction_n() {
|
||||
void test_gpu_contraction_n() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_cuda_contraction<ColMajor>(128, 128, k);
|
||||
test_cuda_contraction<RowMajor>(128, 128, k);
|
||||
test_gpu_contraction<ColMajor>(128, 128, k);
|
||||
test_gpu_contraction<RowMajor>(128, 128, k);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_contraction_sizes() {
|
||||
void test_gpu_contraction_sizes() {
|
||||
int m_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257 , 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
@ -186,29 +187,32 @@ void test_cuda_contraction_sizes() {
|
||||
for (int i = 0; i < 15; i++) {
|
||||
for (int j = 0; j < 15; j++) {
|
||||
for (int k = 0; k < 17; k++) {
|
||||
test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
|
||||
test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_cxx11_tensor_cuda()
|
||||
void test_cxx11_tensor_gpu()
|
||||
{
|
||||
CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
|
||||
CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
|
||||
|
||||
CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
|
||||
|
||||
CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
|
||||
CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
|
||||
CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
|
||||
CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
|
||||
|
||||
CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
|
||||
CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
|
||||
CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
|
||||
CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
|
||||
|
||||
CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
|
||||
CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
|
||||
CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
|
||||
CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
|
||||
|
||||
CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
|
||||
CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
|
||||
#if !defined(EIGEN_USE_HIP)
|
||||
// disable these subtests for HIP
|
||||
CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
|
||||
CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
|
||||
#endif
|
||||
}
|
@ -16,6 +16,7 @@
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
|
||||
|
||||
using Eigen::Tensor;
|
||||
using Eigen::RowMajor;
|
||||
@ -66,22 +67,22 @@ struct CPUContext {
|
||||
// Context for evaluation on GPU
|
||||
struct GPUContext {
|
||||
GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
|
||||
assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
|
||||
assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
|
||||
float kernel_1d_val[] = {3.14f, 2.7f};
|
||||
assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
|
||||
assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
|
||||
|
||||
assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
|
||||
assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
|
||||
float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
|
||||
assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
|
||||
assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
|
||||
|
||||
assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
|
||||
assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
|
||||
float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
|
||||
assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
|
||||
assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
|
||||
}
|
||||
~GPUContext() {
|
||||
assert(cudaFree(kernel_1d_) == cudaSuccess);
|
||||
assert(cudaFree(kernel_2d_) == cudaSuccess);
|
||||
assert(cudaFree(kernel_3d_) == cudaSuccess);
|
||||
assert(gpuFree(kernel_1d_) == gpuSuccess);
|
||||
assert(gpuFree(kernel_2d_) == gpuSuccess);
|
||||
assert(gpuFree(kernel_3d_) == gpuSuccess);
|
||||
}
|
||||
|
||||
const Eigen::GpuDevice& device() const { return gpu_device_; }
|
||||
@ -102,7 +103,7 @@ struct GPUContext {
|
||||
float* kernel_2d_;
|
||||
float* kernel_3d_;
|
||||
|
||||
Eigen::CudaStreamDevice stream_;
|
||||
Eigen::GpuStreamDevice stream_;
|
||||
Eigen::GpuDevice gpu_device_;
|
||||
};
|
||||
|
||||
@ -281,12 +282,12 @@ void test_gpu() {
|
||||
float* d_in1;
|
||||
float* d_in2;
|
||||
float* d_out;
|
||||
cudaMalloc((void**)(&d_in1), in1_bytes);
|
||||
cudaMalloc((void**)(&d_in2), in2_bytes);
|
||||
cudaMalloc((void**)(&d_out), out_bytes);
|
||||
gpuMalloc((void**)(&d_in1), in1_bytes);
|
||||
gpuMalloc((void**)(&d_in2), in2_bytes);
|
||||
gpuMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
|
||||
gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
|
||||
@ -294,7 +295,7 @@ void test_gpu() {
|
||||
|
||||
GPUContext context(gpu_in1, gpu_in2, gpu_out);
|
||||
test_contextual_eval(&context);
|
||||
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
|
||||
assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
@ -304,7 +305,7 @@ void test_gpu() {
|
||||
}
|
||||
|
||||
test_forced_contextual_eval(&context);
|
||||
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
|
||||
assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
@ -314,7 +315,7 @@ void test_gpu() {
|
||||
}
|
||||
|
||||
test_compound_assignment(&context);
|
||||
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
|
||||
assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
@ -324,7 +325,7 @@ void test_gpu() {
|
||||
}
|
||||
|
||||
test_contraction(&context);
|
||||
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
|
||||
assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 40; ++j) {
|
||||
const float result = out(i,j,0);
|
||||
@ -339,8 +340,8 @@ void test_gpu() {
|
||||
}
|
||||
|
||||
test_1d_convolution(&context);
|
||||
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
@ -350,8 +351,8 @@ void test_gpu() {
|
||||
}
|
||||
|
||||
test_2d_convolution(&context);
|
||||
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
@ -363,9 +364,13 @@ void test_gpu() {
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(EIGEN_USE_HIP)
|
||||
// disable this test on the HIP platform
|
||||
// 3D tensor convolutions seem to hang on the HIP platform
|
||||
|
||||
test_3d_convolution(&context);
|
||||
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
|
||||
for (int i = 0; i < 39; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
@ -378,6 +383,9 @@ void test_gpu() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
@ -9,7 +9,7 @@
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_gpu
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
@ -20,8 +20,8 @@
|
||||
using Eigen::Tensor;
|
||||
|
||||
template<typename>
|
||||
void test_cuda_numext() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
void test_gpu_numext() {
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
@ -57,11 +57,11 @@ void test_cuda_numext() {
|
||||
}
|
||||
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
|
||||
template<typename>
|
||||
void test_cuda_conversion() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
void test_gpu_conversion() {
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
@ -95,8 +95,8 @@ void test_cuda_conversion() {
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_cuda_unary() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
void test_gpu_unary() {
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
@ -132,8 +132,8 @@ void test_cuda_unary() {
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_cuda_elementwise() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
void test_gpu_elementwise() {
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
@ -174,8 +174,8 @@ void test_cuda_elementwise() {
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_cuda_trancendental() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
void test_gpu_trancendental() {
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
@ -268,8 +268,8 @@ void test_cuda_trancendental() {
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_cuda_contractions() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
void test_gpu_contractions() {
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int rows = 23;
|
||||
int cols = 23;
|
||||
@ -319,12 +319,12 @@ void test_cuda_contractions() {
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_cuda_reductions(int size1, int size2, int redux) {
|
||||
void test_gpu_reductions(int size1, int size2, int redux) {
|
||||
|
||||
std::cout << "Reducing " << size1 << " by " << size2
|
||||
<< " tensor along dim " << redux << std::endl;
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = size1*size2;
|
||||
int result_size = (redux == 1 ? size1 : size2);
|
||||
@ -368,20 +368,20 @@ void test_cuda_reductions(int size1, int size2, int redux) {
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_cuda_reductions() {
|
||||
test_cuda_reductions<void>(13, 13, 0);
|
||||
test_cuda_reductions<void>(13, 13, 1);
|
||||
void test_gpu_reductions() {
|
||||
test_gpu_reductions<void>(13, 13, 0);
|
||||
test_gpu_reductions<void>(13, 13, 1);
|
||||
|
||||
test_cuda_reductions<void>(35, 36, 0);
|
||||
test_cuda_reductions<void>(35, 36, 1);
|
||||
test_gpu_reductions<void>(35, 36, 0);
|
||||
test_gpu_reductions<void>(35, 36, 1);
|
||||
|
||||
test_cuda_reductions<void>(36, 35, 0);
|
||||
test_cuda_reductions<void>(36, 35, 1);
|
||||
test_gpu_reductions<void>(36, 35, 0);
|
||||
test_gpu_reductions<void>(36, 35, 1);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_cuda_full_reductions() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
void test_gpu_full_reductions() {
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int size = 13;
|
||||
int num_elem = size*size;
|
||||
@ -429,9 +429,9 @@ void test_cuda_full_reductions() {
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_cuda_forced_evals() {
|
||||
void test_gpu_forced_evals() {
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
@ -479,20 +479,20 @@ void test_cuda_forced_evals() {
|
||||
#endif
|
||||
|
||||
|
||||
void test_cxx11_tensor_of_float16_cuda()
|
||||
void test_cxx11_tensor_of_float16_gpu()
|
||||
{
|
||||
CALL_SUBTEST_1(test_cuda_numext<void>());
|
||||
CALL_SUBTEST_1(test_gpu_numext<void>());
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
CALL_SUBTEST_1(test_cuda_conversion<void>());
|
||||
CALL_SUBTEST_1(test_cuda_unary<void>());
|
||||
CALL_SUBTEST_1(test_cuda_elementwise<void>());
|
||||
CALL_SUBTEST_1(test_cuda_trancendental<void>());
|
||||
CALL_SUBTEST_2(test_cuda_contractions<void>());
|
||||
CALL_SUBTEST_3(test_cuda_reductions<void>());
|
||||
CALL_SUBTEST_4(test_cuda_full_reductions<void>());
|
||||
CALL_SUBTEST_5(test_cuda_forced_evals<void>());
|
||||
#ifdef EIGEN_HAS_GPU_FP16
|
||||
CALL_SUBTEST_1(test_gpu_conversion<void>());
|
||||
CALL_SUBTEST_1(test_gpu_unary<void>());
|
||||
CALL_SUBTEST_1(test_gpu_elementwise<void>());
|
||||
CALL_SUBTEST_1(test_gpu_trancendental<void>());
|
||||
CALL_SUBTEST_2(test_gpu_contractions<void>());
|
||||
CALL_SUBTEST_3(test_gpu_reductions<void>());
|
||||
CALL_SUBTEST_4(test_gpu_full_reductions<void>());
|
||||
CALL_SUBTEST_5(test_gpu_forced_evals<void>());
|
||||
#else
|
||||
std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
|
||||
std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
|
||||
#endif
|
||||
}
|
@ -9,15 +9,16 @@
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_random_gpu
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <Eigen/CXX11/Tensor>
|
||||
|
||||
#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
|
||||
|
||||
void test_cuda_random_uniform()
|
||||
void test_gpu_random_uniform()
|
||||
{
|
||||
Tensor<float, 2> out(72,97);
|
||||
out.setZero();
|
||||
@ -25,24 +26,24 @@ void test_cuda_random_uniform()
|
||||
std::size_t out_bytes = out.size() * sizeof(float);
|
||||
|
||||
float* d_out;
|
||||
cudaMalloc((void**)(&d_out), out_bytes);
|
||||
gpuMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_out.random();
|
||||
|
||||
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
|
||||
|
||||
// For now we just check this code doesn't crash.
|
||||
// TODO: come up with a valid test of randomness
|
||||
}
|
||||
|
||||
|
||||
void test_cuda_random_normal()
|
||||
void test_gpu_random_normal()
|
||||
{
|
||||
Tensor<float, 2> out(72,97);
|
||||
out.setZero();
|
||||
@ -50,9 +51,9 @@ void test_cuda_random_normal()
|
||||
std::size_t out_bytes = out.size() * sizeof(float);
|
||||
|
||||
float* d_out;
|
||||
cudaMalloc((void**)(&d_out), out_bytes);
|
||||
gpuMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
|
||||
@ -60,8 +61,8 @@ void test_cuda_random_normal()
|
||||
Eigen::internal::NormalRandomGenerator<float> gen(true);
|
||||
gpu_out.device(gpu_device) = gpu_out.random(gen);
|
||||
|
||||
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
|
||||
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
|
||||
assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
|
||||
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
|
||||
}
|
||||
|
||||
static void test_complex()
|
||||
@ -77,9 +78,9 @@ static void test_complex()
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_random_cuda()
|
||||
void test_cxx11_tensor_random_gpu()
|
||||
{
|
||||
CALL_SUBTEST(test_cuda_random_uniform());
|
||||
CALL_SUBTEST(test_cuda_random_normal());
|
||||
CALL_SUBTEST(test_gpu_random_uniform());
|
||||
CALL_SUBTEST(test_gpu_random_normal());
|
||||
CALL_SUBTEST(test_complex());
|
||||
}
|
@ -9,7 +9,7 @@
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_gpu
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
@ -19,7 +19,7 @@
|
||||
template<typename Type, int DataLayout>
|
||||
static void test_full_reductions() {
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
const int num_rows = internal::random<int>(1024, 5*1024);
|
||||
@ -67,7 +67,7 @@ static void test_first_dim_reductions() {
|
||||
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
|
||||
|
||||
// Create device
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice dev(&stream);
|
||||
|
||||
// Create data(T)
|
||||
@ -107,7 +107,7 @@ static void test_last_dim_reductions() {
|
||||
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
|
||||
|
||||
// Create device
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice dev(&stream);
|
||||
|
||||
// Create data
|
||||
@ -134,7 +134,7 @@ static void test_last_dim_reductions() {
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_reduction_cuda() {
|
||||
void test_cxx11_tensor_reduction_gpu() {
|
||||
CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
|
||||
CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
|
||||
CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
|
@ -9,19 +9,20 @@
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_scan_gpu
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
|
||||
|
||||
using Eigen::Tensor;
|
||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||
|
||||
template<int DataLayout>
|
||||
void test_cuda_cumsum(int m_size, int k_size, int n_size)
|
||||
void test_gpu_cumsum(int m_size, int k_size, int n_size)
|
||||
{
|
||||
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
|
||||
@ -36,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
|
||||
float* d_t_input;
|
||||
float* d_t_result;
|
||||
|
||||
cudaMalloc((void**)(&d_t_input), t_input_bytes);
|
||||
cudaMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
gpuMalloc((void**)(&d_t_input), t_input_bytes);
|
||||
gpuMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
|
||||
cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
|
||||
gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice);
|
||||
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
|
||||
@ -52,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
|
||||
gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
|
||||
t_result = t_input.cumsum(1);
|
||||
|
||||
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
|
||||
gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
|
||||
for (DenseIndex i = 0; i < t_result.size(); i++) {
|
||||
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
|
||||
continue;
|
||||
@ -65,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
|
||||
assert(false);
|
||||
}
|
||||
|
||||
cudaFree((void*)d_t_input);
|
||||
cudaFree((void*)d_t_result);
|
||||
gpuFree((void*)d_t_input);
|
||||
gpuFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_scan_cuda()
|
||||
void test_cxx11_tensor_scan_gpu()
|
||||
{
|
||||
CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
|
||||
CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));
|
||||
}
|