mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-20 08:39:37 +08:00
updates based on PR feedback
There are two major changes (and a few minor ones which are not listed here...see PR discussion for details) 1. Eigen::half implementations for HIP and CUDA have been merged. This means that - `CUDA/Half.h` and `HIP/hcc/Half.h` got merged to a new file `GPU/Half.h` - `CUDA/PacketMathHalf.h` and `HIP/hcc/PacketMathHalf.h` got merged to a new file `GPU/PacketMathHalf.h` - `CUDA/TypeCasting.h` and `HIP/hcc/TypeCasting.h` got merged to a new file `GPU/TypeCasting.h` After this change the `HIP/hcc` directory only contains one file `math_constants.h`. That will go away too once that file becomes a part of the HIP install. 2. new macros EIGEN_GPUCC, EIGEN_GPU_COMPILE_PHASE and EIGEN_HAS_GPU_FP16 have been added and the code has been updated to use them where appropriate. - `EIGEN_GPUCC` is the same as `(EIGEN_CUDACC || EIGEN_HIPCC)` - `EIGEN_GPU_DEVICE_COMPILE` is the same as `(EIGEN_CUDA_ARCH || EIGEN_HIP_DEVICE_COMPILE)` - `EIGEN_HAS_GPU_FP16` is the same as `(EIGEN_HAS_CUDA_FP16 or EIGEN_HAS_HIP_FP16)`
This commit is contained in:
parent
ba972fb6b4
commit
b6cc0961b1
71
Eigen/Core
71
Eigen/Core
@ -99,6 +99,61 @@
|
|||||||
#define EIGEN_DONT_VECTORIZE
|
#define EIGEN_DONT_VECTORIZE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
||||||
|
//
|
||||||
|
// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
|
||||||
|
//
|
||||||
|
#define EIGEN_GPUCC
|
||||||
|
//
|
||||||
|
// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
|
||||||
|
// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
|
||||||
|
//
|
||||||
|
// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
|
||||||
|
// For those cases, the corresponding code should be guarded with
|
||||||
|
// #if defined(EIGEN_GPUCC)
|
||||||
|
// instead of
|
||||||
|
// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
||||||
|
//
|
||||||
|
// For cases where the tweak is specific to HIP, the code should be guarded with
|
||||||
|
// #if defined(EIGEN_HIPCC)
|
||||||
|
//
|
||||||
|
// For cases where the tweak is specific to CUDA, the code should be guarded with
|
||||||
|
// #if defined(EIGEN_CUDACC)
|
||||||
|
//
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
//
|
||||||
|
// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
|
||||||
|
//
|
||||||
|
#define EIGEN_GPU_COMPILE_PHASE
|
||||||
|
//
|
||||||
|
// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
|
||||||
|
// + one to compile the source for the "host" (ie CPU)
|
||||||
|
// + another to compile the source for the "device" (ie. GPU)
|
||||||
|
//
|
||||||
|
// Code that needs to enabled only during the either the "host" or "device" compilation phase
|
||||||
|
// needs to be guarded with a macro that indicates the current compilation phase
|
||||||
|
//
|
||||||
|
// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
|
||||||
|
// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
|
||||||
|
//
|
||||||
|
// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
|
||||||
|
// For those cases, the code should be guarded with
|
||||||
|
// #if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
|
// instead of
|
||||||
|
// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
//
|
||||||
|
// For cases where the tweak is specific to HIP, the code should be guarded with
|
||||||
|
// #if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
//
|
||||||
|
// For cases where the tweak is specific to CUDA, the code should be guarded with
|
||||||
|
// #if defined(EIGEN_CUDA_ARCH)
|
||||||
|
//
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// When compiling CUDA device code with NVCC, or HIP device code with HIPCC
|
// When compiling CUDA device code with NVCC, or HIP device code with HIPCC
|
||||||
// pull in math functions from the global namespace. In host mode, and when
|
// pull in math functions from the global namespace. In host mode, and when
|
||||||
// device doee with clang, use the std versions.
|
// device doee with clang, use the std versions.
|
||||||
@ -312,6 +367,10 @@
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
||||||
|
#define EIGEN_HAS_GPU_FP16
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
|
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
|
||||||
#define EIGEN_HAS_OPENMP
|
#define EIGEN_HAS_OPENMP
|
||||||
#endif
|
#endif
|
||||||
@ -475,15 +534,9 @@ using std::ptrdiff_t;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Half float support
|
// Half float support
|
||||||
#if defined EIGEN_USE_HIP
|
#include "src/Core/arch/GPU/Half.h"
|
||||||
#include "src/Core/arch/HIP/hcc/Half.h"
|
#include "src/Core/arch/GPU/PacketMathHalf.h"
|
||||||
#include "src/Core/arch/HIP/hcc/PacketMathHalf.h"
|
#include "src/Core/arch/GPU/TypeCasting.h"
|
||||||
#include "src/Core/arch/HIP/hcc/TypeCasting.h"
|
|
||||||
#else
|
|
||||||
#include "src/Core/arch/CUDA/Half.h"
|
|
||||||
#include "src/Core/arch/CUDA/PacketMathHalf.h"
|
|
||||||
#include "src/Core/arch/CUDA/TypeCasting.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined EIGEN_VECTORIZE_CUDA
|
#if defined EIGEN_VECTORIZE_CUDA
|
||||||
#include "src/Core/arch/CUDA/PacketMath.h"
|
#include "src/Core/arch/CUDA/PacketMath.h"
|
||||||
|
@ -35,7 +35,7 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
|
|||||||
template<int Size, int MaxSize> struct product_size_category
|
template<int Size, int MaxSize> struct product_size_category
|
||||||
{
|
{
|
||||||
enum {
|
enum {
|
||||||
#ifndef EIGEN_CUDA_ARCH
|
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||||
is_large = MaxSize == Dynamic ||
|
is_large = MaxSize == Dynamic ||
|
||||||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
||||||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
||||||
|
@ -301,13 +301,11 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
|
|||||||
{ pstore(to, from); }
|
{ pstore(to, from); }
|
||||||
|
|
||||||
/** \internal tries to do cache prefetching of \a addr */
|
/** \internal tries to do cache prefetching of \a addr */
|
||||||
template<typename Scalar>
|
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||||
#if !defined(EIGEN_HIPCC)
|
|
||||||
EIGEN_DEVICE_FUNC
|
|
||||||
#endif
|
|
||||||
inline void prefetch(const Scalar* addr)
|
|
||||||
{
|
{
|
||||||
#ifdef EIGEN_CUDA_ARCH
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
// do nothing
|
||||||
|
#elif defined(EIGEN_CUDA_ARCH)
|
||||||
#if defined(__LP64__)
|
#if defined(__LP64__)
|
||||||
// 64-bit pointer operand constraint for inlined asm
|
// 64-bit pointer operand constraint for inlined asm
|
||||||
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
||||||
@ -534,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
|
|||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
// Eigen+CUDA does not support complexes.
|
// Eigen+CUDA does not support complexes.
|
||||||
#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
|
#if !defined(EIGEN_GPUCC)
|
||||||
|
|
||||||
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
||||||
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||||
|
@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
|
|||||||
|
|
||||||
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
|
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
|
||||||
|
|
||||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
template<typename T>
|
template<typename T>
|
||||||
struct real_impl<std::complex<T> >
|
struct real_impl<std::complex<T> >
|
||||||
{
|
{
|
||||||
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
|
|||||||
|
|
||||||
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
|
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
|
||||||
|
|
||||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
template<typename T>
|
template<typename T>
|
||||||
struct imag_impl<std::complex<T> >
|
struct imag_impl<std::complex<T> >
|
||||||
{
|
{
|
||||||
@ -260,7 +260,7 @@ struct conj_default_impl<Scalar,true>
|
|||||||
|
|
||||||
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
|
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
|
||||||
|
|
||||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
template<typename T>
|
template<typename T>
|
||||||
struct conj_impl<std::complex<T> >
|
struct conj_impl<std::complex<T> >
|
||||||
{
|
{
|
||||||
@ -773,9 +773,7 @@ EIGEN_DEVICE_FUNC
|
|||||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||||
isfinite_impl(const T& x)
|
isfinite_impl(const T& x)
|
||||||
{
|
{
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
return isfinite(x);
|
|
||||||
#elif defined(EIGEN_CUDA_ARCH)
|
|
||||||
return (::isfinite)(x);
|
return (::isfinite)(x);
|
||||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||||
using std::isfinite;
|
using std::isfinite;
|
||||||
@ -790,9 +788,7 @@ EIGEN_DEVICE_FUNC
|
|||||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||||
isinf_impl(const T& x)
|
isinf_impl(const T& x)
|
||||||
{
|
{
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
return isinf(x);
|
|
||||||
#elif defined(EIGEN_CUDA_ARCH)
|
|
||||||
return (::isinf)(x);
|
return (::isinf)(x);
|
||||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||||
using std::isinf;
|
using std::isinf;
|
||||||
@ -807,9 +803,7 @@ EIGEN_DEVICE_FUNC
|
|||||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||||
isnan_impl(const T& x)
|
isnan_impl(const T& x)
|
||||||
{
|
{
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
return isnan(x);
|
|
||||||
#elif defined(EIGEN_CUDA_ARCH)
|
|
||||||
return (::isnan)(x);
|
return (::isnan)(x);
|
||||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||||
using std::isnan;
|
using std::isnan;
|
||||||
@ -875,7 +869,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
|
|||||||
|
|
||||||
namespace numext {
|
namespace numext {
|
||||||
|
|
||||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) && !defined(__SYCL_DEVICE_ONLY__)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
||||||
template<typename T>
|
template<typename T>
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
||||||
@ -1089,7 +1083,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
|
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float log1p(const float &x) { return ::log1pf(x); }
|
float log1p(const float &x) { return ::log1pf(x); }
|
||||||
|
|
||||||
@ -1147,7 +1141,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
|
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float floor(const float &x) { return ::floorf(x); }
|
float floor(const float &x) { return ::floorf(x); }
|
||||||
|
|
||||||
@ -1168,7 +1162,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
|
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float ceil(const float &x) { return ::ceilf(x); }
|
float ceil(const float &x) { return ::ceilf(x); }
|
||||||
|
|
||||||
@ -1226,7 +1220,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
|
|||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float log(const float &x) { return ::logf(x); }
|
float log(const float &x) { return ::logf(x); }
|
||||||
|
|
||||||
@ -1254,7 +1248,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float abs(const float &x) { return ::fabsf(x); }
|
float abs(const float &x) { return ::fabsf(x); }
|
||||||
|
|
||||||
@ -1284,7 +1278,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
|
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float exp(const float &x) { return ::expf(x); }
|
float exp(const float &x) { return ::expf(x); }
|
||||||
|
|
||||||
@ -1320,7 +1314,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
|
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float expm1(const float &x) { return ::expm1f(x); }
|
float expm1(const float &x) { return ::expm1f(x); }
|
||||||
|
|
||||||
@ -1340,7 +1334,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
|
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float cos(const float &x) { return ::cosf(x); }
|
float cos(const float &x) { return ::cosf(x); }
|
||||||
|
|
||||||
@ -1360,7 +1354,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
|
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float sin(const float &x) { return ::sinf(x); }
|
float sin(const float &x) { return ::sinf(x); }
|
||||||
|
|
||||||
@ -1380,7 +1374,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
|
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float tan(const float &x) { return ::tanf(x); }
|
float tan(const float &x) { return ::tanf(x); }
|
||||||
|
|
||||||
@ -1411,7 +1405,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
|
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float acos(const float &x) { return ::acosf(x); }
|
float acos(const float &x) { return ::acosf(x); }
|
||||||
|
|
||||||
@ -1442,7 +1436,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
|
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float asin(const float &x) { return ::asinf(x); }
|
float asin(const float &x) { return ::asinf(x); }
|
||||||
|
|
||||||
@ -1473,7 +1467,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
|
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float atan(const float &x) { return ::atanf(x); }
|
float atan(const float &x) { return ::atanf(x); }
|
||||||
|
|
||||||
@ -1494,7 +1488,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
|
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float cosh(const float &x) { return ::coshf(x); }
|
float cosh(const float &x) { return ::coshf(x); }
|
||||||
|
|
||||||
@ -1514,7 +1508,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
|
|||||||
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
|
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float sinh(const float &x) { return ::sinhf(x); }
|
float sinh(const float &x) { return ::sinhf(x); }
|
||||||
|
|
||||||
@ -1532,12 +1526,12 @@ T tanh(const T &x) {
|
|||||||
#if defined(__SYCL_DEVICE_ONLY__)
|
#if defined(__SYCL_DEVICE_ONLY__)
|
||||||
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
|
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
|
||||||
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
|
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
|
||||||
#elif (!defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)) && EIGEN_FAST_MATH
|
#elif (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float tanh(const float &x) { return ::tanhf(x); }
|
float tanh(const float &x) { return ::tanhf(x); }
|
||||||
|
|
||||||
@ -1557,7 +1551,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
|
|||||||
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
|
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
|
||||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template <>
|
template <>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
float fmod(const float& a, const float& b) {
|
float fmod(const float& a, const float& b) {
|
||||||
|
@ -137,14 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
|
|||||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||||
{
|
{
|
||||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||||
<<<<<<< local
|
|
||||||
#if defined(EIGEN_HIPCC)
|
|
||||||
EIGEN_DEVICE_FUNC
|
|
||||||
#endif
|
|
||||||
static EIGEN_STRONG_INLINE
|
|
||||||
=======
|
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
>>>>>>> other
|
|
||||||
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||||
{
|
{
|
||||||
Index dstRows = src.rows();
|
Index dstRows = src.rows();
|
||||||
@ -397,14 +390,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
|
|||||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||||
|
|
||||||
template<typename Dst>
|
template<typename Dst>
|
||||||
<<<<<<< local
|
|
||||||
#if defined(EIGEN_HIPCC)
|
|
||||||
EIGEN_DEVICE_FUNC
|
|
||||||
#endif
|
|
||||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
|
||||||
=======
|
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||||
>>>>>>> other
|
|
||||||
{
|
{
|
||||||
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
|
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
|
||||||
// but easier on the compiler side
|
// but easier on the compiler side
|
||||||
@ -865,7 +851,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
|||||||
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef EIGEN_CUDACC
|
#ifndef EIGEN_GPUCC
|
||||||
template<int LoadMode,typename PacketType>
|
template<int LoadMode,typename PacketType>
|
||||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||||
{
|
{
|
||||||
@ -909,7 +895,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
|||||||
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef EIGEN_CUDACC
|
#ifndef EIGEN_GPUCC
|
||||||
template<int LoadMode,typename PacketType>
|
template<int LoadMode,typename PacketType>
|
||||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||||
{
|
{
|
||||||
|
@ -26,15 +26,15 @@
|
|||||||
|
|
||||||
|
|
||||||
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
|
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
|
||||||
// type Eigen::half (inheriting from CUDA's __half struct) with
|
// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
|
||||||
// operator overloads such that it behaves basically as an arithmetic
|
// operator overloads such that it behaves basically as an arithmetic
|
||||||
// type. It will be quite slow on CPUs (so it is recommended to stay
|
// type. It will be quite slow on CPUs (so it is recommended to stay
|
||||||
// in fp32 for CPUs, except for simple parameter conversions, I/O
|
// in fp32 for CPUs, except for simple parameter conversions, I/O
|
||||||
// to disk and the likes), but fast on GPUs.
|
// to disk and the likes), but fast on GPUs.
|
||||||
|
|
||||||
|
|
||||||
#ifndef EIGEN_HALF_CUDA_H
|
#ifndef EIGEN_HALF_GPU_H
|
||||||
#define EIGEN_HALF_CUDA_H
|
#define EIGEN_HALF_GPU_H
|
||||||
|
|
||||||
#if __cplusplus > 199711L
|
#if __cplusplus > 199711L
|
||||||
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
|
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
|
||||||
@ -49,16 +49,41 @@ struct half;
|
|||||||
|
|
||||||
namespace half_impl {
|
namespace half_impl {
|
||||||
|
|
||||||
#if !defined(EIGEN_HAS_CUDA_FP16)
|
#if !defined(EIGEN_HAS_GPU_FP16)
|
||||||
// Make our own __half_raw definition that is similar to CUDA's.
|
// Make our own __half_raw definition that is similar to CUDA's.
|
||||||
struct __half_raw {
|
struct __half_raw {
|
||||||
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
||||||
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||||
unsigned short x;
|
unsigned short x;
|
||||||
};
|
};
|
||||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
#elif defined(EIGEN_HAS_HIP_FP16)
|
||||||
|
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||||
|
// Make a __half_raw definition that is
|
||||||
|
// ++ compatible with that of Eigen and
|
||||||
|
// ++ add a implcit conversion to the native __half of the old HIP implementation.
|
||||||
|
//
|
||||||
|
// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
|
||||||
|
//
|
||||||
|
// In the old HIP implementation,
|
||||||
|
// ++ __half is a typedef of __fp16
|
||||||
|
// ++ the "__h*" routines take "__half" arguments
|
||||||
|
// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
|
||||||
|
// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine
|
||||||
|
struct __half_raw {
|
||||||
|
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
||||||
|
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||||
|
union {
|
||||||
|
unsigned short x;
|
||||||
|
__half data;
|
||||||
|
};
|
||||||
|
operator __half(void) const { return data; }
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
|
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
|
||||||
typedef __half __half_raw;
|
typedef __half __half_raw;
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
|
||||||
@ -69,8 +94,19 @@ struct half_base : public __half_raw {
|
|||||||
EIGEN_DEVICE_FUNC half_base() {}
|
EIGEN_DEVICE_FUNC half_base() {}
|
||||||
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
|
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
|
||||||
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
|
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
|
||||||
|
#if defined(EIGEN_HAS_GPU_FP16)
|
||||||
|
#if defined(EIGEN_HAS_HIP_FP16)
|
||||||
|
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||||
|
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
|
||||||
|
#else
|
||||||
|
EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
|
||||||
|
#endif
|
||||||
|
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||||
|
#if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
|
||||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
|
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -78,18 +114,39 @@ struct half_base : public __half_raw {
|
|||||||
|
|
||||||
// Class definition.
|
// Class definition.
|
||||||
struct half : public half_impl::half_base {
|
struct half : public half_impl::half_base {
|
||||||
#if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
|
|
||||||
|
// Writing this out as separate #if-else blocks to make the code easier to follow
|
||||||
|
// The same applies to most #if-else blocks in this file
|
||||||
|
#if !defined(EIGEN_HAS_GPU_FP16)
|
||||||
|
typedef half_impl::__half_raw __half_raw;
|
||||||
|
#elif defined(EIGEN_HAS_HIP_FP16)
|
||||||
|
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||||
typedef half_impl::__half_raw __half_raw;
|
typedef half_impl::__half_raw __half_raw;
|
||||||
#endif
|
#endif
|
||||||
|
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||||
|
// Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
|
||||||
|
// So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||||
|
typedef half_impl::__half_raw __half_raw;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC half() {}
|
EIGEN_DEVICE_FUNC half() {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
|
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
|
||||||
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
|
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
|
||||||
|
#if defined(EIGEN_HAS_GPU_FP16)
|
||||||
|
#if defined(EIGEN_HAS_HIP_FP16)
|
||||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||||
|
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||||
|
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||||
|
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
explicit EIGEN_DEVICE_FUNC half(bool b)
|
explicit EIGEN_DEVICE_FUNC half(bool b)
|
||||||
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
|
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
|
||||||
template<class T>
|
template<class T>
|
||||||
@ -201,7 +258,8 @@ namespace Eigen {
|
|||||||
|
|
||||||
namespace half_impl {
|
namespace half_impl {
|
||||||
|
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
|
||||||
|
|
||||||
// Intrinsics for native fp16 support. Note that on current hardware,
|
// Intrinsics for native fp16 support. Note that on current hardware,
|
||||||
// these are no faster than fp32 arithmetic (you need to use the half2
|
// these are no faster than fp32 arithmetic (you need to use the half2
|
||||||
@ -262,7 +320,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
|
|||||||
|
|
||||||
#else // Emulate support for half floats
|
#else // Emulate support for half floats
|
||||||
|
|
||||||
// Definitions for CPUs and older CUDA, mostly working through conversion
|
// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
|
||||||
// to/from fp32.
|
// to/from fp32.
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
|
||||||
@ -342,7 +400,8 @@ union FP32 {
|
|||||||
};
|
};
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
__half tmp_ff = __float2half(ff);
|
__half tmp_ff = __float2half(ff);
|
||||||
return *(__half_raw*)&tmp_ff;
|
return *(__half_raw*)&tmp_ff;
|
||||||
|
|
||||||
@ -398,7 +457,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return __half2float(h);
|
return __half2float(h);
|
||||||
|
|
||||||
#elif defined(EIGEN_HAS_FP16_C)
|
#elif defined(EIGEN_HAS_FP16_C)
|
||||||
@ -432,7 +492,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
|
|||||||
return (a.x & 0x7fff) == 0x7c00;
|
return (a.x & 0x7fff) == 0x7c00;
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return __hisnan(a);
|
return __hisnan(a);
|
||||||
#else
|
#else
|
||||||
return (a.x & 0x7fff) > 0x7c00;
|
return (a.x & 0x7fff) > 0x7c00;
|
||||||
@ -448,7 +509,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
||||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
return half(hexp(a));
|
return half(hexp(a));
|
||||||
#else
|
#else
|
||||||
return half(::expf(float(a)));
|
return half(::expf(float(a)));
|
||||||
@ -458,7 +520,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
|
|||||||
return half(numext::expm1(float(a)));
|
return half(numext::expm1(float(a)));
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return half(::hlog(a));
|
return half(::hlog(a));
|
||||||
#else
|
#else
|
||||||
return half(::logf(float(a)));
|
return half(::logf(float(a)));
|
||||||
@ -471,7 +534,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
|
|||||||
return half(::log10f(float(a)));
|
return half(::log10f(float(a)));
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
||||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
return half(hsqrt(a));
|
return half(hsqrt(a));
|
||||||
#else
|
#else
|
||||||
return half(::sqrtf(float(a)));
|
return half(::sqrtf(float(a)));
|
||||||
@ -493,14 +557,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
|
|||||||
return half(::tanhf(float(a)));
|
return half(::tanhf(float(a)));
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
return half(hfloor(a));
|
return half(hfloor(a));
|
||||||
#else
|
#else
|
||||||
return half(::floorf(float(a)));
|
return half(::floorf(float(a)));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
|
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
return half(hceil(a));
|
return half(hceil(a));
|
||||||
#else
|
#else
|
||||||
return half(::ceilf(float(a)));
|
return half(::ceilf(float(a)));
|
||||||
@ -508,7 +574,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return __hlt(b, a) ? b : a;
|
return __hlt(b, a) ? b : a;
|
||||||
#else
|
#else
|
||||||
const float f1 = static_cast<float>(a);
|
const float f1 = static_cast<float>(a);
|
||||||
@ -517,7 +584,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return __hlt(a, b) ? b : a;
|
return __hlt(a, b) ? b : a;
|
||||||
#else
|
#else
|
||||||
const float f1 = static_cast<float>(a);
|
const float f1 = static_cast<float>(a);
|
||||||
@ -595,7 +663,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
|
|||||||
return Eigen::half(::expf(float(a)));
|
return Eigen::half(::expf(float(a)));
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
||||||
#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
|
#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
return Eigen::half(::hlog(a));
|
return Eigen::half(::hlog(a));
|
||||||
#else
|
#else
|
||||||
return Eigen::half(::logf(float(a)));
|
return Eigen::half(::logf(float(a)));
|
||||||
@ -629,9 +698,12 @@ struct hash<Eigen::half> {
|
|||||||
|
|
||||||
|
|
||||||
// Add the missing shfl_xor intrinsic
|
// Add the missing shfl_xor intrinsic
|
||||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||||
#if EIGEN_CUDACC_VER < 90000
|
#if (EIGEN_CUDACC_VER < 90000) || \
|
||||||
|
defined(EIGEN_HAS_HIP_FP16)
|
||||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||||
#else
|
#else
|
||||||
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
|
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
|
||||||
@ -640,7 +712,8 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
|
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
|
||||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
|
||||||
|
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||||
return Eigen::half_impl::raw_uint16_to_half(
|
return Eigen::half_impl::raw_uint16_to_half(
|
||||||
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
||||||
@ -648,7 +721,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_CUDA_ARCH)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
namespace numext {
|
namespace numext {
|
||||||
|
|
||||||
@ -674,4 +747,4 @@ bool (isfinite)(const Eigen::half& h) {
|
|||||||
} // namespace numext
|
} // namespace numext
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // EIGEN_HALF_CUDA_H
|
#endif // EIGEN_HALF_GPU_H
|
||||||
|
@ -7,15 +7,16 @@
|
|||||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
|
#ifndef EIGEN_PACKET_MATH_HALF_GPU_H
|
||||||
#define EIGEN_PACKET_MATH_HALF_CUDA_H
|
#define EIGEN_PACKET_MATH_HALF_GPU_H
|
||||||
|
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
// Most of the following operations require arch >= 3.0
|
// Most of the following operations require arch >= 3.0
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
|
|
||||||
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
||||||
|
|
||||||
@ -43,7 +44,18 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
|
|||||||
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
|
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||||
|
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||||
|
return half2half2(from);
|
||||||
|
#else
|
||||||
return __half2half2(from);
|
return __half2half2(from);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
return __half2half2(from);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
|
||||||
@ -69,20 +81,46 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half*
|
|||||||
|
|
||||||
template<>
|
template<>
|
||||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
||||||
|
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||||
|
return __halves2half2((*(from+0)), (*(from+1)));
|
||||||
|
#else
|
||||||
|
return __ldg((const half2*)from);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 350
|
#if EIGEN_CUDA_ARCH >= 350
|
||||||
return __ldg((const half2*)from);
|
return __ldg((const half2*)from);
|
||||||
#else
|
#else
|
||||||
return __halves2half2(*(from+0), *(from+1));
|
return __halves2half2(*(from+0), *(from+1));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
||||||
|
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||||
|
return __halves2half2((*(from+0)), (*(from+1)));
|
||||||
|
#else
|
||||||
|
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 350
|
#if EIGEN_CUDA_ARCH >= 350
|
||||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||||
#else
|
#else
|
||||||
return __halves2half2(*(from+0), *(from+1));
|
return __halves2half2(*(from+0), *(from+1));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
|
||||||
@ -117,15 +155,29 @@ ptranspose(PacketBlock<half2,2>& kernel) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||||
#else
|
#else
|
||||||
float f = __half2float(a) + 1.0f;
|
float f = __half2float(a) + 1.0f;
|
||||||
return __halves2half2(a, __float2half(f));
|
return __halves2half2(a, __float2half(f));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
return __hadd2(a, b);
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hadd2(a, b);
|
return __hadd2(a, b);
|
||||||
#else
|
#else
|
||||||
@ -137,9 +189,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
|
|||||||
float r2 = a2 + b2;
|
float r2 = a2 + b2;
|
||||||
return __floats2half2_rn(r1, r2);
|
return __floats2half2_rn(r1, r2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
return __hsub2(a, b);
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hsub2(a, b);
|
return __hsub2(a, b);
|
||||||
#else
|
#else
|
||||||
@ -151,9 +211,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
|
|||||||
float r2 = a2 - b2;
|
float r2 = a2 - b2;
|
||||||
return __floats2half2_rn(r1, r2);
|
return __floats2half2_rn(r1, r2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
return __hneg2(a);
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hneg2(a);
|
return __hneg2(a);
|
||||||
#else
|
#else
|
||||||
@ -161,11 +229,19 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
return __floats2half2_rn(-a1, -a2);
|
return __floats2half2_rn(-a1, -a2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
return __hmul2(a, b);
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hmul2(a, b);
|
return __hmul2(a, b);
|
||||||
#else
|
#else
|
||||||
@ -177,9 +253,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
|
|||||||
float r2 = a2 * b2;
|
float r2 = a2 * b2;
|
||||||
return __floats2half2_rn(r1, r2);
|
return __floats2half2_rn(r1, r2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
return __hfma2(a, b, c);
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hfma2(a, b, c);
|
return __hfma2(a, b, c);
|
||||||
#else
|
#else
|
||||||
@ -193,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
|
|||||||
float r2 = a2 * b2 + c2;
|
float r2 = a2 * b2 + c2;
|
||||||
return __floats2half2_rn(r1, r2);
|
return __floats2half2_rn(r1, r2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||||
|
return h2div(a, b);
|
||||||
|
#else
|
||||||
|
return __h2div(a, b);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
float b1 = __low2float(b);
|
float b1 = __low2float(b);
|
||||||
@ -203,6 +299,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
|
|||||||
float r1 = a1 / b1;
|
float r1 = a1 / b1;
|
||||||
float r2 = a2 / b2;
|
float r2 = a2 / b2;
|
||||||
return __floats2half2_rn(r1, r2);
|
return __floats2half2_rn(r1, r2);
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
||||||
@ -226,6 +324,12 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
return __hadd(__low2half(a), __high2half(a));
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hadd(__low2half(a), __high2half(a));
|
return __hadd(__low2half(a), __high2half(a));
|
||||||
#else
|
#else
|
||||||
@ -233,9 +337,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
return Eigen::half(__float2half(a1 + a2));
|
return Eigen::half(__float2half(a1 + a2));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
__half first = __low2half(a);
|
||||||
|
__half second = __high2half(a);
|
||||||
|
return __hgt(first, second) ? first : second;
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
__half first = __low2half(a);
|
__half first = __low2half(a);
|
||||||
__half second = __high2half(a);
|
__half second = __high2half(a);
|
||||||
@ -245,9 +359,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
return a1 > a2 ? __low2half(a) : __high2half(a);
|
return a1 > a2 ? __low2half(a) : __high2half(a);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
__half first = __low2half(a);
|
||||||
|
__half second = __high2half(a);
|
||||||
|
return __hlt(first, second) ? first : second;
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
__half first = __low2half(a);
|
__half first = __low2half(a);
|
||||||
__half second = __high2half(a);
|
__half second = __high2half(a);
|
||||||
@ -257,9 +381,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
return a1 < a2 ? __low2half(a) : __high2half(a);
|
return a1 < a2 ? __low2half(a) : __high2half(a);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
|
return __hmul(__low2half(a), __high2half(a));
|
||||||
|
|
||||||
|
#else // EIGEN_CUDA_ARCH
|
||||||
|
|
||||||
#if EIGEN_CUDA_ARCH >= 530
|
#if EIGEN_CUDA_ARCH >= 530
|
||||||
return __hmul(__low2half(a), __high2half(a));
|
return __hmul(__low2half(a), __high2half(a));
|
||||||
#else
|
#else
|
||||||
@ -267,6 +399,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
return Eigen::half(__float2half(a1 * a2));
|
return Eigen::half(__float2half(a1 * a2));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
||||||
@ -285,7 +419,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
|
|||||||
return __floats2half2_rn(r1, r2);
|
return __floats2half2_rn(r1, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
|
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||||
|
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
template<> __device__ EIGEN_STRONG_INLINE
|
template<> __device__ EIGEN_STRONG_INLINE
|
||||||
half2 plog<half2>(const half2& a) {
|
half2 plog<half2>(const half2& a) {
|
||||||
@ -1130,4 +1265,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
|
#endif // EIGEN_PACKET_MATH_HALF_GPU_H
|
||||||
|
@ -7,8 +7,8 @@
|
|||||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
#ifndef EIGEN_TYPE_CASTING_CUDA_H
|
#ifndef EIGEN_TYPE_CASTING_GPU_H
|
||||||
#define EIGEN_TYPE_CASTING_CUDA_H
|
#define EIGEN_TYPE_CASTING_GPU_H
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
|
||||||
@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
|
|||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||||
typedef Eigen::half result_type;
|
typedef Eigen::half result_type;
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return __float2half(a);
|
return __float2half(a);
|
||||||
#else
|
#else
|
||||||
return Eigen::half(a);
|
return Eigen::half(a);
|
||||||
@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
|
|||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||||
typedef Eigen::half result_type;
|
typedef Eigen::half result_type;
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return __float2half(static_cast<float>(a));
|
return __float2half(static_cast<float>(a));
|
||||||
#else
|
#else
|
||||||
return Eigen::half(static_cast<float>(a));
|
return Eigen::half(static_cast<float>(a));
|
||||||
@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
|
|||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||||
typedef float result_type;
|
typedef float result_type;
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return __half2float(a);
|
return __half2float(a);
|
||||||
#else
|
#else
|
||||||
return static_cast<float>(a);
|
return static_cast<float>(a);
|
||||||
@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct type_casting_traits<Eigen::half, float> {
|
struct type_casting_traits<Eigen::half, float> {
|
||||||
@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f
|
|||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
#endif // EIGEN_TYPE_CASTING_CUDA_H
|
#endif // EIGEN_TYPE_CASTING_GPU_H
|
||||||
|
@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
|
|||||||
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
|
EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
|
||||||
{
|
{
|
||||||
#ifdef EIGEN_CUDACC
|
#ifdef EIGEN_GPUCC
|
||||||
// FIXME is there some kind of cuda::swap?
|
// FIXME is there some kind of cuda::swap?
|
||||||
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
|
Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
|
||||||
#else
|
#else
|
||||||
|
@ -436,10 +436,7 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
|
|||||||
typedef typename BinaryOp::second_argument_type second_argument_type;
|
typedef typename BinaryOp::second_argument_type second_argument_type;
|
||||||
typedef typename BinaryOp::result_type result_type;
|
typedef typename BinaryOp::result_type result_type;
|
||||||
|
|
||||||
#if defined(EIGEN_HIPCC)
|
EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
|
||||||
EIGEN_DEVICE_FUNC explicit
|
|
||||||
#endif
|
|
||||||
bind1st_op(const first_argument_type &val) : m_value(val) {}
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
|
||||||
|
|
||||||
@ -458,10 +455,7 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
|
|||||||
typedef typename BinaryOp::second_argument_type second_argument_type;
|
typedef typename BinaryOp::second_argument_type second_argument_type;
|
||||||
typedef typename BinaryOp::result_type result_type;
|
typedef typename BinaryOp::result_type result_type;
|
||||||
|
|
||||||
#if defined(EIGEN_HIPCC)
|
EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
|
||||||
EIGEN_DEVICE_FUNC explicit
|
|
||||||
#endif
|
|
||||||
bind2nd_op(const second_argument_type &val) : m_value(val) {}
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
|
||||||
|
|
||||||
|
@ -163,10 +163,7 @@ class BlasLinearMapper {
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
|
||||||
|
|
||||||
#if !defined(EIGEN_HIPCC)
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
|
||||||
EIGEN_DEVICE_FUNC
|
|
||||||
#endif
|
|
||||||
EIGEN_ALWAYS_INLINE void prefetch(int i) const {
|
|
||||||
internal::prefetch(&operator()(i));
|
internal::prefetch(&operator()(i));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -171,7 +171,7 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
|
|||||||
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
|
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
|
||||||
|
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
result = aligned_malloc(size);
|
result = ::malloc(size);
|
||||||
#else
|
#else
|
||||||
result = std::malloc(size);
|
result = std::malloc(size);
|
||||||
#endif
|
#endif
|
||||||
@ -195,7 +195,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
|
|||||||
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
|
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
|
||||||
|
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
aligned_free(ptr);
|
::free(ptr);
|
||||||
#else
|
#else
|
||||||
std::free(ptr);
|
std::free(ptr);
|
||||||
#endif
|
#endif
|
||||||
@ -244,7 +244,7 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
|
|||||||
check_that_malloc_is_allowed();
|
check_that_malloc_is_allowed();
|
||||||
|
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
void *result = aligned_malloc(size);
|
void *result = ::malloc(size);
|
||||||
#else
|
#else
|
||||||
void *result = std::malloc(size);
|
void *result = std::malloc(size);
|
||||||
#endif
|
#endif
|
||||||
@ -263,7 +263,7 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
|
|||||||
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
|
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
|
||||||
{
|
{
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
aligned_free(ptr);
|
::free(ptr);
|
||||||
#else
|
#else
|
||||||
std::free(ptr);
|
std::free(ptr);
|
||||||
#endif
|
#endif
|
||||||
|
@ -11,16 +11,19 @@
|
|||||||
#ifndef EIGEN_META_H
|
#ifndef EIGEN_META_H
|
||||||
#define EIGEN_META_H
|
#define EIGEN_META_H
|
||||||
|
|
||||||
#if defined(EIGEN_CUDA_ARCH)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
#include <cfloat>
|
|
||||||
#include <math_constants.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#include <cfloat>
|
||||||
#include <cfloat>
|
|
||||||
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#if defined(EIGEN_CUDA_ARCH)
|
||||||
|
#include <math_constants.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
|
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@ -181,7 +184,7 @@ template<bool Condition, typename T=void> struct enable_if;
|
|||||||
template<typename T> struct enable_if<true,T>
|
template<typename T> struct enable_if<true,T>
|
||||||
{ typedef T type; };
|
{ typedef T type; };
|
||||||
|
|
||||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
#if !defined(__FLT_EPSILON__)
|
#if !defined(__FLT_EPSILON__)
|
||||||
#define __FLT_EPSILON__ FLT_EPSILON
|
#define __FLT_EPSILON__ FLT_EPSILON
|
||||||
#define __DBL_EPSILON__ DBL_EPSILON
|
#define __DBL_EPSILON__ DBL_EPSILON
|
||||||
@ -565,13 +568,13 @@ template<typename T, typename U> struct scalar_product_traits
|
|||||||
|
|
||||||
namespace numext {
|
namespace numext {
|
||||||
|
|
||||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
|
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
|
||||||
#else
|
#else
|
||||||
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
|
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
using internal::device::numeric_limits;
|
using internal::device::numeric_limits;
|
||||||
#else
|
#else
|
||||||
using std::numeric_limits;
|
using std::numeric_limits;
|
||||||
@ -590,7 +593,7 @@ T div_ceil(const T &a, const T &b)
|
|||||||
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||||
bool equal_strict(const X& x,const Y& y) { return x == y; }
|
bool equal_strict(const X& x,const Y& y) { return x == y; }
|
||||||
|
|
||||||
#if !defined(EIGEN_CUDA_ARCH)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||||
bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
|
bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
|
||||||
|
|
||||||
@ -601,7 +604,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to<double
|
|||||||
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||||
bool not_equal_strict(const X& x,const Y& y) { return x != y; }
|
bool not_equal_strict(const X& x,const Y& y) { return x != y; }
|
||||||
|
|
||||||
#if !defined(EIGEN_CUDA_ARCH)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
|
||||||
bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
|
bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
|
||||||
|
|
||||||
|
@ -1299,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
|
|||||||
#endif
|
#endif
|
||||||
}//end deflation
|
}//end deflation
|
||||||
|
|
||||||
#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
|
#if !defined(EIGEN_GPUCC)
|
||||||
/** \svd_module
|
/** \svd_module
|
||||||
*
|
*
|
||||||
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
|
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
|
||||||
#include <Eigen/src/Core/arch/CUDA/Half.h>
|
#include <Eigen/src/Core/arch/GPU/Half.h>
|
||||||
|
|
||||||
// Make sure it's possible to forward declare Eigen::half
|
// Make sure it's possible to forward declare Eigen::half
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
16
test/main.h
16
test/main.h
@ -68,9 +68,19 @@
|
|||||||
// are defined here and any not-parenthesized min/max call will cause a
|
// are defined here and any not-parenthesized min/max call will cause a
|
||||||
// compiler error.
|
// compiler error.
|
||||||
#if !defined(__HIPCC__)
|
#if !defined(__HIPCC__)
|
||||||
// HIP headers include the <thread> header which contains not-parenthesized
|
//
|
||||||
// calls to "max", triggering the following check and causing the compile to fail
|
// HIP header files include the following files
|
||||||
// so disabling the following checks for HIP
|
// <thread>
|
||||||
|
// <regex>
|
||||||
|
// <unordered_map>
|
||||||
|
// which seem to contain not-parenthesized calls to "max"/"min", triggering the following check and causing the compile to fail
|
||||||
|
//
|
||||||
|
// Including those header files before the following macro definition for "min" / "max", only partially resolves the issue
|
||||||
|
// This is because other HIP header files also define "isnan" / "isinf" / "isfinite" functions, which are needed in other
|
||||||
|
// headers.
|
||||||
|
//
|
||||||
|
// So instead choosing to simply disable this check for HIP
|
||||||
|
//
|
||||||
#define min(A,B) please_protect_your_min_with_parentheses
|
#define min(A,B) please_protect_your_min_with_parentheses
|
||||||
#define max(A,B) please_protect_your_max_with_parentheses
|
#define max(A,B) please_protect_your_max_with_parentheses
|
||||||
#define isnan(X) please_protect_your_isnan_with_parentheses
|
#define isnan(X) please_protect_your_isnan_with_parentheses
|
||||||
|
@ -35,7 +35,7 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
// Running on the host CPU
|
// Running on the host CPU
|
||||||
return 1;
|
return 1;
|
||||||
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
@ -48,9 +48,12 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
||||||
// Running on the host CPU
|
// Running on the host CPU
|
||||||
return l1CacheSize();
|
return l1CacheSize();
|
||||||
|
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
// Running on a HIP device
|
||||||
|
return 48*1024; // FIXME : update this number for HIP
|
||||||
#else
|
#else
|
||||||
// Running on a CUDA device, return the amount of shared memory available.
|
// Running on a CUDA device, return the amount of shared memory available.
|
||||||
return 48*1024;
|
return 48*1024;
|
||||||
@ -58,9 +61,12 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
||||||
// Running single threaded on the host CPU
|
// Running single threaded on the host CPU
|
||||||
return l3CacheSize();
|
return l3CacheSize();
|
||||||
|
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
// Running on a HIP device
|
||||||
|
return firstLevelCacheSize(); // FIXME : update this number for HIP
|
||||||
#else
|
#else
|
||||||
// Running on a CUDA device
|
// Running on a CUDA device
|
||||||
return firstLevelCacheSize();
|
return firstLevelCacheSize();
|
||||||
@ -68,7 +74,7 @@ struct DefaultDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
// Running single threaded on the host CPU
|
// Running single threaded on the host CPU
|
||||||
// Should return an enum that encodes the ISA supported by the CPU
|
// Should return an enum that encodes the ISA supported by the CPU
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
#if defined(EIGEN_GPUCC)
|
||||||
template <typename Evaluator, typename Index, bool Vectorizable>
|
template <typename Evaluator, typename Index, bool Vectorizable>
|
||||||
struct EigenMetaKernelEval {
|
struct EigenMetaKernelEval {
|
||||||
static __device__ EIGEN_ALWAYS_INLINE
|
static __device__ EIGEN_ALWAYS_INLINE
|
||||||
@ -276,7 +276,7 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
|
|||||||
evaluator.cleanup();
|
evaluator.cleanup();
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // EIGEN_CUDACC || EIGEN_HIPCC
|
#endif // EIGEN_GPUCC
|
||||||
#endif // EIGEN_USE_GPU
|
#endif // EIGEN_USE_GPU
|
||||||
|
|
||||||
// SYCL Executor policy
|
// SYCL Executor policy
|
||||||
|
@ -35,7 +35,7 @@ namespace {
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
|
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
|
||||||
{
|
{
|
||||||
#ifdef EIGEN_CUDA_ARCH
|
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||||
return __clz(val);
|
return __clz(val);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return cl::sycl::clz(val);
|
return cl::sycl::clz(val);
|
||||||
@ -53,7 +53,7 @@ namespace {
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||||
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
|
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
|
||||||
{
|
{
|
||||||
#ifdef EIGEN_CUDA_ARCH
|
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||||
return __clzll(val);
|
return __clzll(val);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return cl::sycl::clz(val);
|
return cl::sycl::clz(val);
|
||||||
@ -90,7 +90,7 @@ namespace {
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
|
||||||
#if defined(EIGEN_CUDA_ARCH)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
return __umulhi(a, b);
|
return __umulhi(a, b);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
|
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
|
||||||
@ -101,7 +101,7 @@ namespace {
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
|
||||||
#if defined(EIGEN_CUDA_ARCH)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
return __umul64hi(a, b);
|
return __umul64hi(a, b);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
|
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
|
||||||
@ -124,7 +124,7 @@ namespace {
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
struct DividerHelper<64, T> {
|
struct DividerHelper<64, T> {
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
|
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
|
||||||
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
|
#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
|
||||||
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
|
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
|
||||||
#else
|
#else
|
||||||
const uint64_t shift = 1ULL << log_div;
|
const uint64_t shift = 1ULL << log_div;
|
||||||
@ -203,7 +203,7 @@ class TensorIntDivisor<int32_t, true> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
|
||||||
#ifdef EIGEN_CUDA_ARCH
|
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||||
return (__umulhi(magic, n) >> shift);
|
return (__umulhi(magic, n) >> shift);
|
||||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||||
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
|
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
|
||||||
|
@ -27,7 +27,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// SFINAE requires variadic templates
|
// SFINAE requires variadic templates
|
||||||
#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
|
#if !defined(EIGEN_GPUCC)
|
||||||
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
#if EIGEN_HAS_VARIADIC_TEMPLATES
|
||||||
// SFINAE doesn't work for gcc <= 4.7
|
// SFINAE doesn't work for gcc <= 4.7
|
||||||
#ifdef EIGEN_COMP_GNUC
|
#ifdef EIGEN_COMP_GNUC
|
||||||
|
@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// For CUDA packet types when using a GpuDevice
|
// For CUDA packet types when using a GpuDevice
|
||||||
#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16)))
|
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
|
||||||
template <>
|
template <>
|
||||||
struct PacketType<half, GpuDevice> {
|
struct PacketType<half, GpuDevice> {
|
||||||
typedef half2 type;
|
typedef half2 type;
|
||||||
|
@ -16,7 +16,7 @@ namespace internal {
|
|||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
|
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
|
||||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
// We don't support 3d kernels since we currently only use 1 and
|
// We don't support 3d kernels since we currently only use 1 and
|
||||||
// 2d kernels.
|
// 2d kernels.
|
||||||
assert(threadIdx.z == 0);
|
assert(threadIdx.z == 0);
|
||||||
|
@ -334,12 +334,12 @@ struct OuterReducer {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
|
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
|
||||||
template <int B, int N, typename S, typename R, typename I>
|
template <int B, int N, typename S, typename R, typename I>
|
||||||
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||||
|
|
||||||
|
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
#if defined(EIGEN_HAS_GPU_FP16)
|
||||||
template <typename S, typename R, typename I>
|
template <typename S, typename R, typename I>
|
||||||
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
||||||
template <int B, int N, typename S, typename R, typename I>
|
template <int B, int N, typename S, typename R, typename I>
|
||||||
@ -698,9 +698,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
|||||||
#ifdef EIGEN_USE_THREADS
|
#ifdef EIGEN_USE_THREADS
|
||||||
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
|
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
|
||||||
#endif
|
#endif
|
||||||
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
|
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
|
||||||
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
#if defined(EIGEN_HAS_GPU_FP16)
|
||||||
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
|
||||||
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
|
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
|
||||||
template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
|
template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
|
||||||
@ -793,7 +793,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
|
|||||||
Op m_reducer;
|
Op m_reducer;
|
||||||
|
|
||||||
// For full reductions
|
// For full reductions
|
||||||
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
|
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
|
||||||
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
|
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
|
||||||
static const bool RunningOnSycl = false;
|
static const bool RunningOnSycl = false;
|
||||||
#elif defined(EIGEN_USE_SYCL)
|
#elif defined(EIGEN_USE_SYCL)
|
||||||
|
@ -242,7 +242,7 @@ struct ScanLauncher {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
|
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
|
||||||
|
|
||||||
// GPU implementation of scan
|
// GPU implementation of scan
|
||||||
// TODO(ibab) This placeholder implementation performs multiple scans in
|
// TODO(ibab) This placeholder implementation performs multiple scans in
|
||||||
@ -286,7 +286,7 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
#endif // EIGEN_USE_GPU && (EIGEN_CUDACC || EIGEN_HIPCC)
|
#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
|
@ -268,10 +268,7 @@ template<
|
|||||||
typename Reducer
|
typename Reducer
|
||||||
> struct reduce<Reducer>
|
> struct reduce<Reducer>
|
||||||
{
|
{
|
||||||
#if defined(EIGEN_HIPCC)
|
EIGEN_DEVICE_FUNC constexpr static inline int run() { return Reducer::Identity; }
|
||||||
EIGEN_DEVICE_FUNC
|
|
||||||
#endif
|
|
||||||
constexpr static inline int run() { return Reducer::Identity; }
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<
|
template<
|
||||||
@ -279,10 +276,7 @@ template<
|
|||||||
typename A
|
typename A
|
||||||
> struct reduce<Reducer, A>
|
> struct reduce<Reducer, A>
|
||||||
{
|
{
|
||||||
#if defined(EIGEN_HIPCC)
|
EIGEN_DEVICE_FUNC constexpr static inline A run(A a) { return a; }
|
||||||
EIGEN_DEVICE_FUNC
|
|
||||||
#endif
|
|
||||||
constexpr static inline A run(A a) { return a; }
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<
|
template<
|
||||||
@ -291,10 +285,7 @@ template<
|
|||||||
typename... Ts
|
typename... Ts
|
||||||
> struct reduce<Reducer, A, Ts...>
|
> struct reduce<Reducer, A, Ts...>
|
||||||
{
|
{
|
||||||
#if defined(EIGEN_HIPCC)
|
EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
|
||||||
EIGEN_DEVICE_FUNC
|
|
||||||
#endif
|
|
||||||
constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
|
|
||||||
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
|
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -333,10 +324,7 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
|
|||||||
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
|
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
|
||||||
// does...
|
// does...
|
||||||
template<typename... Ts>
|
template<typename... Ts>
|
||||||
#if defined(EIGEN_HIPCC)
|
EIGEN_DEVICE_FUNC constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
|
||||||
EIGEN_DEVICE_FUNC
|
|
||||||
#endif
|
|
||||||
constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
|
|
||||||
{
|
{
|
||||||
return reduce<product_op, Ts...>::run(ts...);
|
return reduce<product_op, Ts...>::run(ts...);
|
||||||
}
|
}
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
// The array class is only available starting with cxx11. Emulate our own here
|
// The array class is only available starting with cxx11. Emulate our own here
|
||||||
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
|
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
|
||||||
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
|
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
|
||||||
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) || defined(EIGEN_AVOID_STL_ARRAY)
|
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
template <typename T, size_t n> class array {
|
template <typename T, size_t n> class array {
|
||||||
|
@ -190,7 +190,7 @@ template <>
|
|||||||
struct lgamma_impl<float> {
|
struct lgamma_impl<float> {
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
static EIGEN_STRONG_INLINE float run(float x) {
|
static EIGEN_STRONG_INLINE float run(float x) {
|
||||||
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
||||||
int dummy;
|
int dummy;
|
||||||
return ::lgammaf_r(x, &dummy);
|
return ::lgammaf_r(x, &dummy);
|
||||||
#else
|
#else
|
||||||
@ -203,7 +203,7 @@ template <>
|
|||||||
struct lgamma_impl<double> {
|
struct lgamma_impl<double> {
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
static EIGEN_STRONG_INLINE double run(double x) {
|
static EIGEN_STRONG_INLINE double run(double x) {
|
||||||
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
|
||||||
int dummy;
|
int dummy;
|
||||||
return ::lgamma_r(x, &dummy);
|
return ::lgamma_r(x, &dummy);
|
||||||
#else
|
#else
|
||||||
|
@ -17,7 +17,7 @@ namespace internal {
|
|||||||
// Make sure this is only available when targeting a GPU: we don't want to
|
// Make sure this is only available when targeting a GPU: we don't want to
|
||||||
// introduce conflicts between these packet_traits definitions and the ones
|
// introduce conflicts between these packet_traits definitions and the ones
|
||||||
// we'll use on the host side (SSE, AVX, ...)
|
// we'll use on the host side (SSE, AVX, ...)
|
||||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
float4 plgamma<float4>(const float4& a)
|
float4 plgamma<float4>(const float4& a)
|
||||||
|
@ -1222,9 +1222,8 @@ void test_cxx11_tensor_hip()
|
|||||||
CALL_SUBTEST(test_hip_elementwise());
|
CALL_SUBTEST(test_hip_elementwise());
|
||||||
CALL_SUBTEST(test_hip_props());
|
CALL_SUBTEST(test_hip_props());
|
||||||
CALL_SUBTEST(test_hip_reduction());
|
CALL_SUBTEST(test_hip_reduction());
|
||||||
// FIXME : uncommenting following tests results in compile failure
|
CALL_SUBTEST(test_hip_contraction<ColMajor>());
|
||||||
// CALL_SUBTEST(test_hip_contraction<ColMajor>());
|
CALL_SUBTEST(test_hip_contraction<RowMajor>());
|
||||||
// CALL_SUBTEST(test_hip_contraction<RowMajor>());
|
|
||||||
CALL_SUBTEST(test_hip_convolution_1d<ColMajor>());
|
CALL_SUBTEST(test_hip_convolution_1d<ColMajor>());
|
||||||
CALL_SUBTEST(test_hip_convolution_1d<RowMajor>());
|
CALL_SUBTEST(test_hip_convolution_1d<RowMajor>());
|
||||||
CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
|
CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user